Example #1
    def run_aligner(self):
        """ Run the aligner. """
        if all([self.params_are_valid(), self.exec_is_valid()]):
            if not self.output_exists():
                run_oe(self.compile_command(), self.out_file, self.out_log)
            else:
                if self.overwrite:
                    log("Overwriting pre-existing file: " + self.out_file)
                    run_oe(self.compile_command(), self.out_file, self.out_log)
                else:
                    log("Retaining pre-existing file: " + self.out_file)
Example #2
def run_samtools(output_path, num_threads, overwrite_files):
    """ Compress, sort and index alignments with pysam. """
    if os.path.isfile(output_path + "c_reads_against_query.s.bam"):
        if not overwrite_files:
            log("Retaining pre-existing file: " + output_path +
                "c_reads_against_query.s.bam")
        else:
            log("Overwriting pre-existing file: " + output_path +
                "c_reads_against_query.s.bam")
            pysam.view("-@",
                       str(num_threads),
                       "-b",
                       "-o",
                       output_path + "c_reads_against_query.bam",
                       output_path + "c_reads_against_query.sam",
                       catch_stdout=False)
            pysam.sort("-@",
                       str(num_threads),
                       "-o",
                       output_path + "c_reads_against_query.s.bam",
                       output_path + "c_reads_against_query.bam",
                       catch_stdout=False)
    else:
        pysam.view("-@",
                   str(num_threads),
                   "-b",
                   "-o",
                   output_path + "c_reads_against_query.bam",
                   output_path + "c_reads_against_query.sam",
                   catch_stdout=False)
        pysam.sort("-@",
                   str(num_threads),
                   "-o",
                   output_path + "c_reads_against_query.s.bam",
                   output_path + "c_reads_against_query.bam",
                   catch_stdout=False)

    log("Indexing read alignments")
    if os.path.isfile(output_path + "c_reads_against_query.s.bam.bai"):
        if not overwrite_files:
            log("Retaining pre-existing file: " + output_path +
                "c_reads_against_query.s.bam.bai")
        else:
            log("Overwriting pre-existing file: " + output_path +
                "c_reads_against_query.s.bam.bai")
            pysam.index(output_path + "c_reads_against_query.s.bam",
                        catch_stdout=False)
    else:
        pysam.index(output_path + "c_reads_against_query.s.bam",
                    catch_stdout=False)
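For reference, the module-level pysam commands above dispatch to the samtools CLI, so the three calls are equivalent to "samtools view -@ N -b -o ...", "samtools sort -@ N -o ...", and "samtools index ...". A hedged usage sketch (the directory is a placeholder and must end with "/" because the function builds file names by concatenation):

run_samtools("out/", 4, True)  # writes out/c_reads_against_query.s.bam and its .bai index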
Example #3
    def params_are_valid(self):
        """
        Do a basic check to make sure the unimap parameters are valid.
        I won't check that every parameter is valid, but will check anything that can
        cause a problem for RagTag later on.
        :return: True if the parameters are valid. Raises appropriate errors otherwise
        """
        all_flags = "".join([i for i in self.params_string.split(" ") if i.startswith("-")])
        if "a" in all_flags:
            raise ValueError("Alignments must not be in SAM format (-a).")

        if "c" in all_flags:
            log("WARNING", "Computing base-alignments (-c) will slow down Unimap alignment.")

        return True
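Note that the test above is character-level, not an exact flag match: all dash-prefixed tokens are concatenated into one string before checking for "a" or "c". A small self-contained illustration:

tokens = "-f 0.02 --cs -t 8".split(" ")
all_flags = "".join([i for i in tokens if i.startswith("-")])
print(all_flags)         # "-f--cs-t"
print("c" in all_flags)  # True: the -c warning fires even though only --cs was given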
Example #4
def make_gff_interval_tree(gff_file):
    # Dictionary storing an interval tree for each sequence header
    t = defaultdict(IntervalTree)

    # Iterate over the gff file
    with open(gff_file, "r") as f:
        for line in f:
            if not line.startswith("#"):
                fields = line.split("\t")
                h, start, end = fields[0], int(fields[3]), int(fields[4])
                start = start - 1  # make everything zero-indexed
                assert start < end

                if end - start > 100000:
                    coords = "%s:%d-%d" %(h, start+1, end)
                    log("WARNING: large interval in this gff file (%s). This could disproportionately invalidate putative query breakpoints." % coords)
                t[h][start:end] = (start, end)

    return t
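Usage sketch: an intervaltree point query returns the (possibly empty) set of overlapping intervals, which is how Example #9 tests breakpoints with "if it[ctg][i]". The path below is a placeholder:

t = make_gff_interval_tree("genes.gff")
hits = t["chr1"][15000]  # set of Interval objects covering 0-based position 15000
if hits:
    print("position 15000 falls inside an annotated feature")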
Example #5
def get_median_read_coverage(output_path, num_threads, overwrite_files):
    """ Given the read alignments, use samtools stats to return an approximate median coverage value. """
    log("Calculating global read coverage")
    if os.path.isfile(output_path + "c_reads_against_query.s.bam.stats"):
        if not overwrite_files:
            log("retaining pre-existing file: " + output_path + "c_reads_against_query.s.bam.stats")
        else:
            log("overwriting pre-existing file: " + output_path + "c_reads_against_query.s.bam.stats")
            st = pysam.stats("-@", str(num_threads), output_path + "c_reads_against_query.s.bam")
            with open(output_path + "c_reads_against_query.s.bam.stats", "w") as f:
                f.write(st)
    else:
        st = pysam.stats("-@", str(num_threads), output_path + "c_reads_against_query.s.bam")
        with open(output_path + "c_reads_against_query.s.bam.stats", "w") as f:
            f.write(st)

    # Get the coverage histogram (for 1 to 1k)
    covs = []
    with open(output_path + "c_reads_against_query.s.bam.stats") as f:
        for line in f:
            if line.startswith("COV"):
                covs.append(int(line.split("\t")[3]))

    # Get the median from the histogram
    covs = np.asarray(covs, dtype=np.int32)

    # Remove the last value, which is a catch-all for coverages > 1k
    covs = covs[:-1]
    mid = sum(covs) // 2
    cs = 0
    for i in range(len(covs)):
        cs += covs[i]
        if cs >= mid:
            return i
    raise ValueError("Unable to calculate read coverage. Check SAM/BAM files and stats file.")
Example #6
def main():
    parser = argparse.ArgumentParser(
        description="Update gff intervals given a RagTag AGP file",
        usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
    parser.add_argument("gff",
                        nargs='?',
                        default="",
                        metavar="<genes.gff>",
                        type=str,
                        help="gff file")
    parser.add_argument("agp",
                        nargs='?',
                        default="",
                        metavar="<ragtag.*.agp>",
                        type=str,
                        help="agp file")
    parser.add_argument(
        "-c",
        action="store_true",
        default=False,
        help="update for misassembly correction (ragtag.correction.agp)")

    args = parser.parse_args()

    if not args.gff or not args.agp:
        parser.print_help()
        sys.exit()

    log("RagTag " + get_ragtag_version())
    log("CMD: " + " ".join(sys.argv))

    gff_file = os.path.abspath(args.gff)
    agp_file = os.path.abspath(args.agp)
    is_sub = args.c

    if is_sub:
        sub_update(gff_file, agp_file)
    else:
        sup_update(gff_file, agp_file)
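A hedged invocation sketch for this entry point (paths are placeholders; sub_update and sup_update are defined elsewhere in RagTag):

import sys

# Simulates: ragtag.py updategff -c genes.gff ragtag.correction.agp
sys.argv = ["ragtag.py updategff", "-c", "genes.gff", "ragtag.correction.agp"]
main()  # dispatches to sub_update() because -c was given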
Example #7
def main():
    parser = argparse.ArgumentParser(description="Check AGP v2.1 files for validity.", usage="ragtag.py agpcheck <asm1.agp> [<asm2.agp> ... <asmN.agp>]")
    parser.add_argument("agp", metavar="<asm1.agp> [<asm2.agp> ... <asmN.agp>]", nargs='+', default=[], type=str, help="AGP v2.1 files")

    DISCLAIMER = """
    DISCLAIMER:
    This utility performs most (but not all) checks necessary to validate an
    AGP v2.1 file: https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/
    
    Please additionally use the NCBI AGP validator for robust
    validation: https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Validation/
    """

    args = parser.parse_args()

    print(DISCLAIMER)
    agp_file_list = [os.path.abspath(i) for i in args.agp]
    for agp_file in agp_file_list:
        print()
        log("INFO", "Checking {} ...".format(agp_file))
        agp = AGPFile(agp_file, mode="r")
        for _ in agp.iterate_lines():
            pass
        log("INFO", "Check for {} is complete with no errors.".format(agp_file))
Example #8
def write_breaks(out_file, query_file, ctg_breaks, overwrite, remove_suffix):
    """ Write the intermediate file for contig breaks in AGP v2.1 format."""
    # Check if the output file already exists
    if os.path.isfile(out_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_file)
            return
        log("Overwriting pre-existing file: " + out_file)

    fai = pysam.FastaFile(query_file)
    all_q_seqs = sorted(fai.references)
    agp = AGPFile(out_file, mode="w")

    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    for q in all_q_seqs:

        # Check if this sequence was broken during misassembly correction
        if q not in ctg_breaks:

            # Add suffix to query header, unless otherwise requested
            unchanged_comp_header = q
            if not remove_suffix:
                unchanged_comp_header = q + ":0" + "-" + str(
                    fai.get_reference_length(q)) + "(+)"

            agp.add_seq_line(q, "1", str(fai.get_reference_length(q)), "1",
                             "W", unchanged_comp_header, "1",
                             str(fai.get_reference_length(q)), "+")
        else:  # This query sequence was broken
            pid = 1
            sorted_breaks = sorted(ctg_breaks[q])
            start = 0
            for i in sorted_breaks:
                agp.add_seq_line(q, str(start + 1), str(i), str(pid), "W",
                                 q + ":" + str(start) + "-" + str(i) + "(+)",
                                 "1", str(i - start), "+")
                start = i
                pid += 1

            # Add one line for the last interval
            agp.add_seq_line(
                q, str(start + 1), str(fai.get_reference_length(q)), str(pid),
                "W", q + ":" + str(start) + "-" +
                str(fai.get_reference_length(q)) + "(+)", "1",
                str(fai.get_reference_length(q) - start), "+")

    log("Writing: " + out_file)
    agp.write()
    fai.close()
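A hedged usage sketch with hypothetical inputs: break contig_1 at 15 kbp and 42 kbp and leave every other query sequence intact (query.fasta is a placeholder and must be FASTA-indexable by pysam):

ctg_breaks = {"contig_1": [15000, 42000]}
write_breaks("ragtag.correction.agp", "query.fasta", ctg_breaks,
             overwrite=True, remove_suffix=False)
# contig_1 yields three W lines: 1-15000, 15001-42000, and 42001 to the contig end.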
Example #9
def main():
    parser = argparse.ArgumentParser(
        description='Reference-guided misassembly correction',
        usage="ragtag.py correct <reference.fa> <query.fa>")

    cor_options = parser.add_argument_group("correction options")
    cor_options.add_argument(
        "reference",
        metavar="<reference.fa>",
        nargs='?',
        default="",
        type=str,
        help="reference fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument(
        "query",
        metavar="<query.fa>",
        nargs='?',
        default="",
        type=str,
        help="query fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument("-f",
                             metavar="INT",
                             type=int,
                             default=1000,
                             help="minimum unique alignment length [1000]")
    cor_options.add_argument("--remove-small",
                             action="store_true",
                             default=False,
                             help="remove unique alignments shorter than -f")
    cor_options.add_argument(
        "-q",
        metavar="INT",
        type=int,
        default=10,
        help="minimum mapq (NA for Nucmer alignments) [10]")
    cor_options.add_argument("-d",
                             metavar="INT",
                             type=int,
                             default=100000,
                             help="alignment merge distance [100000]")
    cor_options.add_argument(
        "-b",
        metavar="INT",
        type=int,
        default=5000,
        help="minimum break distance from contig ends [5000]")
    cor_options.add_argument("-e",
                             metavar="<exclude.txt>",
                             type=str,
                             default="",
                             help="list of reference headers to ignore")
    cor_options.add_argument("-j",
                             metavar="<skip.txt>",
                             type=str,
                             default="",
                             help="list of query headers to leave uncorrected")
    cor_options.add_argument(
        "--inter",
        action="store_true",
        default=False,
        help="only break misassemblies between reference sequences")
    cor_options.add_argument(
        "--intra",
        action="store_true",
        default=False,
        help="only break misassemblies within reference sequences")
    cor_options.add_argument("--gff",
                             metavar="<features.gff>",
                             type=str,
                             default="",
                             help="don't break sequences within gff intervals")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o",
                            metavar="PATH",
                            type=str,
                            default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w",
                            action='store_true',
                            default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u",
                            action='store_true',
                            default=False,
                            help="add suffix to unaltered sequence headers")
    io_options.add_argument("--debug",
                            action='store_true',
                            default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    mm2_default = "-x asm5"
    aln_options.add_argument("-t",
                             metavar="INT",
                             type=int,
                             default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument(
        "--aligner",
        metavar="PATH",
        type=str,
        default="minimap2",
        help=
        "whole genome aligner executable ('nucmer' or 'minimap2') [minimap2]")
    aln_options.add_argument(
        "--mm2-params",
        metavar="STR",
        type=str,
        default=mm2_default,
        help="space delimited minimap2 whole genome alignment parameters ['%s']"
        % mm2_default)
    aln_options.add_argument(
        "--nucmer-params",
        metavar="STR",
        type=str,
        default="-l 100 -c 500",
        help=
        "space delimited nucmer whole genome alignment parameters ['-l 100 -c 500']"
    )

    val_options = parser.add_argument_group("validation options")
    val_options.add_argument(
        "--read-aligner",
        metavar="PATH",
        type=str,
        default="minimap2",
        help="read aligner executable (only 'minimap2' is allowed) [minimap2]")
    val_options.add_argument(
        "-R",
        metavar="<reads.fasta>",
        type=str,
        default="",
        help="validation reads. gzipped fastq or fasta allowed.")
    val_options.add_argument("-F",
                             metavar="<reads.fofn>",
                             type=str,
                             default="",
                             help="same as '-R', but a list of files.")
    val_options.add_argument(
        "-T",
        metavar="sr",
        type=str,
        default="",
        help=
        "read type. 'sr' and 'corr' accepted for short reads and error corrected long-reads, respectively."
    )
    val_options.add_argument("-v",
                             metavar="INT",
                             type=int,
                             default=10000,
                             help="coverage validation window size [10000]")
    val_options.add_argument(
        "--max-cov",
        metavar="INT",
        type=int,
        default=-1,
        help="break sequences at regions at or above this coverage level [AUTO]"
    )
    val_options.add_argument(
        "--min-cov",
        metavar="INT",
        type=int,
        default=-1,
        help="break sequences at regions at or below this coverage level [AUTO]"
    )
    val_options.add_argument(
        "-m", metavar="INT", type=int, default=1000, help=argparse.SUPPRESS
    )  # Merge breakpoints within this distance after validation

    args = parser.parse_args()

    if not args.reference or not args.query:
        parser.print_help()
        sys.exit()

    log("RagTag " + get_ragtag_version())
    log("CMD: " + " ".join(sys.argv))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    num_threads = args.t
    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    min_break_dist = args.m
    min_break_end_dist = args.b
    val_window_size = args.v

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    output_path = os.path.abspath(output_path) + "/"

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'."
            )

    gff_file = args.gff
    if gff_file:
        gff_file = os.path.abspath(gff_file)

    # Skip/exclude options
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(args.j)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments
    genome_aligner_path = args.aligner
    genome_aligner = genome_aligner_path.split("/")[-1]
    if genome_aligner not in {'minimap2', 'nucmer'}:
        raise ValueError(
            "Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'."
        )

    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params
    min_mapq = args.q
    if genome_aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Check if intra/inter breaking is desired
    break_intra = True
    break_inter = True
    only_intra = args.intra
    only_inter = args.inter
    if only_intra and only_inter:
        raise ValueError(
            "Must specify either '--inter' or '--intra', not both.")

    if only_intra:
        break_inter = False
    if only_inter:
        break_intra = False

    # read-alignment parameters
    val_reads = args.R
    val_reads_fofn = args.F
    val_reads_tech = args.T
    read_aligner_path = args.read_aligner
    read_aligner = read_aligner_path.split("/")[-1]
    if read_aligner != "minimap2":
        raise ValueError(
            "Only minimap2 can be used for read alignments. Got: %s" %
            read_aligner)

    # If the genome aligner is minimap2, we can just use that path for read alignment
    if genome_aligner == 'minimap2':
        read_aligner_path = genome_aligner_path

    # Make sure that if -R or -F, -T has been specified.
    if val_reads or val_reads_fofn:
        if not val_reads_tech:
            raise ValueError("'-T' must be provided when using -R or -F.")

    # Make a list of read sequences.
    read_files = []
    if val_reads_fofn:
        with open(val_reads_fofn, "r") as f:
            for line in f:
                read_files.append(os.path.abspath(line.rstrip()))
    elif val_reads:
        read_files.append(os.path.abspath(val_reads))

    # Coverage thresholds
    max_cov = args.max_cov
    min_cov = args.min_cov

    if max_cov < 0:
        if max_cov != -1:
            raise ValueError("--max-cov must be >=0")

    if min_cov < 0:
        if min_cov != -1:
            raise ValueError("--min-cov must be >=0")

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.correction.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.correction.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.correction.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.correction.debug.query.info.txt"

    # Align the query to the reference.
    log("Mapping the query genome to the reference genome")
    if genome_aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file],
                             genome_aligner_path,
                             mm2_params,
                             output_path + "c_query_against_ref",
                             in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file],
                           genome_aligner_path,
                           nucmer_params,
                           output_path + "c_query_against_ref",
                           in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, convert from delta to paf.
    if genome_aligner == "nucmer":
        cmd = [
            "ragtag_delta2paf.py", output_path + "c_query_against_ref.delta"
        ]
        run_o(
            cmd,
            output_path + "c_query_against_ref.paf",
        )

    # Read and organize the alignments.
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "c_query_against_ref.paf",
                                      query_blacklist, ref_blacklist)

    # Filter and merge the alignments.
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for i in ctg_alns:

        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[i]))

        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(
            min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:

                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(ctg_alns[i]))

                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist)

    # Get the putative breakpoints for each query sequence, if any.
    ctg_breaks = dict()
    for i in ctg_alns:
        if ctg_alns[i] is not None:

            # Write merged alignments and confidence scores
            if debug_mode:
                with open(debug_merged_file, "a") as f:
                    f.write(str(ctg_alns[i]))

                with open(debug_query_info_file, "a") as f:
                    f.write("\t".join([
                        i,
                        ctg_alns[i].best_ref_header,
                        str(ctg_alns[i].grouping_confidence),
                        str(ctg_alns[i].location_confidence),
                        str(ctg_alns[i].orientation_confidence),
                    ]) + "\n")

            breaks = []
            intra_breaks, inter_breaks = ctg_alns[i].get_break_candidates(
                min_dist=min_break_end_dist)
            if break_intra:
                breaks = breaks + intra_breaks
            if break_inter:
                breaks = breaks + inter_breaks
            if breaks:
                ctg_breaks[i] = breaks

    # If desired, validate the putative breakpoints by observing read coverage.
    if read_files:
        log("Validating putative query breakpoints via read alignment.")
        log("Aligning reads to query sequences.")
        if not os.path.isfile(output_path + "c_reads_against_query.s.bam"):
            if val_reads_tech == "sr":
                al = Minimap2SAMAligner(query_file,
                                        read_files,
                                        read_aligner_path,
                                        "-ax sr -t " + str(num_threads),
                                        output_path + "c_reads_against_query",
                                        in_overwrite=overwrite_files)
            elif val_reads_tech == "corr":
                al = Minimap2SAMAligner(query_file,
                                        read_files,
                                        read_aligner_path,
                                        "-ax asm5 -t " + str(num_threads),
                                        output_path + "c_reads_against_query",
                                        in_overwrite=overwrite_files)
            else:
                raise ValueError("'-T' must be either 'sr' or 'corr'.")
            al.run_aligner()
        else:
            log("Retaining pre-existing read alignments: " + output_path +
                "c_reads_against_query.s.bam")

        # Compress, sort and index the alignments.
        log("Compressing, sorting, and indexing read alignments")
        run_samtools(output_path, num_threads, overwrite_files)

        # Validate the breakpoints
        log("Validating putative query breakpoints")

        # Require breakpoints to be at least 10 kbp (corr) or 1 kbp (sr) from contig ends so coverage can accumulate.
        val_min_break_end_dist = min_break_end_dist
        if val_reads_tech == "corr":
            val_min_break_end_dist = max(10000, min_break_end_dist)
        if val_reads_tech == "sr":
            val_min_break_end_dist = max(1000, min_break_end_dist)

        # Validate the breakpoints
        ctg_breaks = validate_breaks(ctg_breaks,
                                     output_path,
                                     num_threads,
                                     overwrite_files,
                                     val_min_break_end_dist,
                                     max_cov,
                                     min_cov,
                                     window_size=val_window_size,
                                     clean_dist=min_break_dist,
                                     debug=debug_mode)

    # Check if we need to avoid gff intervals
    if gff_file:
        log("Avoiding breaks within GFF intervals")
        it = make_gff_interval_tree(gff_file)
        non_gff_breaks = dict()
        for ctg in ctg_breaks:
            new_breaks = []
            for i in ctg_breaks[ctg]:
                if it[ctg][i]:
                    log("Avoiding breaking %s at %d. This point intersects a feature in the gff file."
                        % (ctg, i))
                else:
                    new_breaks.append(i)
            if new_breaks:
                non_gff_breaks[ctg] = new_breaks
        ctg_breaks = non_gff_breaks

    # Write the summary of query sequence breaks in AGP format
    agp_file = output_path + "ragtag.correction.agp"
    write_breaks(agp_file, query_file, ctg_breaks, overwrite_files,
                 remove_suffix)

    # Write the scaffolds.
    log("Writing broken contigs")
    qf_name = query_file.split("/")[-1]
    qf_pref = qf_name[:qf_name.rfind(".")]
    cmd = ["ragtag_break_query.py", agp_file, query_file]
    run_o(cmd, output_path + qf_pref + ".corrected.fasta")

    log("Goodbye")
Example #10
def validate_breaks(ctg_breaks,
                    output_path,
                    num_threads,
                    overwrite_files,
                    min_break_end_dist,
                    max_cutoff,
                    min_cutoff,
                    window_size=10000,
                    num_devs=3,
                    clean_dist=1000,
                    debug=False):
    """
    """
    # Get the median coverage over all bp
    glob_med = get_median_read_coverage(output_path, num_threads,
                                        overwrite_files)
    dev = round(math.sqrt(glob_med))

    if max_cutoff == -1:
        max_cutoff = glob_med + (num_devs * dev)

    if min_cutoff == -1:
        min_cutoff = max(0, (glob_med - (num_devs * dev)))

    log("The global median read coverage is %dX" % glob_med)
    log("The max and min coverage thresholds are %dX and %dX, respectively" %
        (max_cutoff, min_cutoff))

    # Go through each break point and query the coverage within the vicinity of the breakpoint.
    bam = pysam.AlignmentFile(output_path + "c_reads_against_query.s.bam")
    validated_ctg_breaks = dict()
    for ctg in ctg_breaks:
        val_breaks = []

        # Iterate over each breakpoint for this query sequence
        for b in ctg_breaks[ctg]:
            # Don't extend the validation window too close to the contig ends (defined by min_break_end_dist)
            min_range = max(min_break_end_dist, b - (window_size // 2))
            max_range = min(
                (bam.get_reference_length(ctg) - min_break_end_dist),
                b + (window_size // 2))

            if min_range >= max_range:
                continue

            region = "%s:%d-%d" % (ctg, min_range, max_range - 1)
            depth_out = pysam.samtools.depth(
                "-aa", "-r", region,
                output_path + "c_reads_against_query.s.bam")
            covs = np.asarray(
                [i.split("\t")[2] for i in depth_out.rstrip().split("\n")],
                dtype=np.int32)
            assert len(covs) == max_range - min_range

            # Given the coverage in vicinity of the breakpoint, find the max and min coverage.
            cov_min, cov_max = np.min(covs), np.max(covs)
            too_high = cov_max >= max_cutoff
            too_low = cov_min <= min_cutoff
            new_break = None
            status = "not validated"
            if too_low and too_high:
                val_breaks.append(np.argmin(covs) + min_range)
                new_break = np.argmin(covs) + min_range
                status = "low and high cov"
            elif too_low:
                val_breaks.append(np.argmin(covs) + min_range)
                new_break = np.argmin(covs) + min_range
                status = "low cov"
            elif too_high:
                val_breaks.append(np.argmax(covs) + min_range)
                new_break = np.argmax(covs) + min_range
                status = "high cov"

            if debug:
                log("query: %s, original break: %s, window start: %d, window end: %d, status: %s, new_break: %s, cov max: %d, cov min: %d"
                    % (ctg, b, min_range, max_range, status, str(new_break),
                       cov_max, cov_min))

        validated_ctg_breaks[ctg] = clean_breaks(val_breaks, clean_dist)

    return validated_ctg_breaks
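A worked example of the automatic coverage cutoffs above, with a hypothetical global median of 36x:

import math

glob_med = 36
dev = round(math.sqrt(glob_med))          # 6
num_devs = 3
print(glob_med + num_devs * dev)          # max_cutoff = 54
print(max(0, glob_med - num_devs * dev))  # min_cutoff = 18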
Example #11
def write_orderings(out_agp_file, out_confidence_file, query_file,
                    ordering_dict, ctg_dict, gap_dict, gap_type_dict,
                    make_chr0, overwrite, add_suffix):
    # Check if the output file already exists
    if os.path.isfile(out_agp_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_agp_file)
            return
        log("Overwriting pre-existing file: " + out_agp_file)

    # Proceed with writing the intermediate output
    placed_seqs = set()
    all_out_cs_lines = []  # For confidence scores
    agp = AGPFile(out_agp_file, mode="w")

    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Go through the reference sequences in sorted order
    sorted_ref_headers = sorted(list(ordering_dict.keys()))
    for ref_header in sorted_ref_headers:
        pid = 1
        pos = 0
        new_ref_header = ref_header + "_RagTag"
        q_seqs = ordering_dict[ref_header]
        gap_seqs = gap_dict[ref_header]
        gap_types = gap_type_dict[ref_header]

        # Iterate through the query sequences for this reference header
        for i in range(len(q_seqs)):
            out_agp_line = []
            out_cs_line = []
            q = q_seqs[i][2]
            placed_seqs.add(q)
            qlen = ctg_dict[q].query_len
            strand = ctg_dict[q].orientation
            gc, lc, oc = ctg_dict[q].grouping_confidence, ctg_dict[
                q].location_confidence, ctg_dict[q].orientation_confidence
            out_agp_line.append(new_ref_header)
            out_agp_line.append(str(pos + 1))
            pos += qlen
            out_agp_line.append(str(pos))
            out_agp_line.append(str(pid))
            out_agp_line.append("W")
            out_agp_line.append(q)
            out_agp_line.append("1")
            out_agp_line.append(str(ctg_dict[q].query_len))
            out_agp_line.append(strand)

            # Save the confidence score info
            out_cs_line.append(q)
            out_cs_line.append(str(gc))
            out_cs_line.append(str(lc))
            out_cs_line.append(str(oc))

            agp.add_seq_line(*out_agp_line)
            all_out_cs_lines.append("\t".join(out_cs_line))
            pid += 1

            if i < len(gap_seqs):
                # Print the gap line
                out_agp_line = []
                out_agp_line.append(new_ref_header)
                out_agp_line.append(str(pos + 1))
                pos += gap_seqs[i]
                out_agp_line.append(str(pos))
                out_agp_line.append(str(pid))
                gap_type = gap_types[i]
                out_agp_line.append(gap_type)
                out_agp_line.append(str(gap_seqs[i]))
                out_agp_line.append("scaffold")
                out_agp_line.append("yes")
                out_agp_line.append("align_genus")
                pid += 1
                agp.add_gap_line(*out_agp_line)

    # Write unplaced sequences
    fai = pysam.FastaFile(query_file)
    all_seqs = set(fai.references)
    unplaced_seqs = sorted(list(all_seqs - placed_seqs))
    if unplaced_seqs:
        if make_chr0:
            pos = 0
            pid = 1
            new_ref_header = "Chr0_RagTag"
            for q in unplaced_seqs:
                out_agp_line = []
                qlen = fai.get_reference_length(q)
                out_agp_line.append(new_ref_header)
                out_agp_line.append(str(pos + 1))
                pos += qlen
                out_agp_line.append(str(pos))
                out_agp_line.append(str(pid))
                out_agp_line.append("W")
                out_agp_line.append(q)
                out_agp_line.append("1")
                out_agp_line.append(str(qlen))
                out_agp_line.append("+")

                agp.add_seq_line(*out_agp_line)
                pid += 1

                # Now for the gap, since we are making a chr0
                out_agp_line = []
                out_agp_line.append(new_ref_header)
                out_agp_line.append(str(pos + 1))
                pos += 100
                out_agp_line.append(str(pos))
                out_agp_line.append(str(pid))
                out_agp_line.append("U")
                out_agp_line.append("100")
                out_agp_line.append("contig")
                out_agp_line.append("no")
                out_agp_line.append("na")

                agp.add_gap_line(*out_agp_line)
                pid += 1

            # Remove the final unnecessary gap
            agp.pop_agp_line()
        else:
            # List the unplaced contigs individually
            for q in unplaced_seqs:
                out_agp_line = []
                qlen = fai.get_reference_length(q)
                if add_suffix:
                    out_agp_line.append(q + "_RagTag")
                else:
                    out_agp_line.append(q)
                out_agp_line.append("1")
                out_agp_line.append(str(qlen))
                out_agp_line.append("1")
                out_agp_line.append("W")
                out_agp_line.append(q)
                out_agp_line.append("1")
                out_agp_line.append(str(qlen))
                out_agp_line.append("+")
                agp.add_seq_line(*out_agp_line)

    agp.write()
    fai.close()

    # Write the confidence scores
    with open(out_confidence_file, "w") as f:
        f.write(
            "query\tgrouping_confidence\tlocation_confidence\torientation_confidence\n"
        )
        f.write("\n".join(all_out_cs_lines) + "\n")
Example #12
def main():
    parser = argparse.ArgumentParser(
        description='Reference-guided scaffolding',
        usage="ragtag.py scaffold <reference.fa> <query.fa>")

    parser.add_argument("reference",
                        metavar="<reference.fa>",
                        nargs='?',
                        default="",
                        type=str,
                        help="reference fasta file (uncompressed or bgzipped)")
    parser.add_argument("query",
                        metavar="<query.fa>",
                        nargs='?',
                        default="",
                        type=str,
                        help="query fasta file (uncompressed or bgzipped)")

    scaf_options = parser.add_argument_group("scaffolding options")
    scaf_options.add_argument(
        "-e",
        metavar="<exclude.txt>",
        type=str,
        default="",
        help="list of reference headers to ignore [null]")
    scaf_options.add_argument(
        "-j",
        metavar="<skip.txt>",
        type=str,
        default="",
        help="list of query headers to leave unplaced [null]")
    scaf_options.add_argument("-f",
                              metavar="INT",
                              type=int,
                              default=1000,
                              help="minimum unique alignment length [1000]")
    scaf_options.add_argument("--remove-small",
                              action="store_true",
                              default=False,
                              help="remove unique alignments shorter than -f")
    scaf_options.add_argument(
        "-q",
        metavar="INT",
        type=int,
        default=10,
        help="minimum mapq (NA for Nucmer alignments) [10]")
    scaf_options.add_argument("-d",
                              metavar="INT",
                              type=int,
                              default=100000,
                              help="alignment merge distance [100000]")
    scaf_options.add_argument("-i",
                              metavar="FLOAT",
                              type=float,
                              default=0.2,
                              help="minimum grouping confidence score [0.2]")
    scaf_options.add_argument("-a",
                              metavar="FLOAT",
                              type=float,
                              default=0.0,
                              help="minimum location confidence score [0.0]")
    scaf_options.add_argument(
        "-s",
        metavar="FLOAT",
        type=float,
        default=0.0,
        help="minimum orientation confidence score [0.0]")
    scaf_options.add_argument(
        "-C",
        action='store_true',
        default=False,
        help="concatenate unplaced contigs and make 'chr0'")
    scaf_options.add_argument(
        "-r",
        action='store_true',
        default=False,
        help="infer gap sizes. if not, all gaps are 100 bp")
    scaf_options.add_argument("-g",
                              metavar="INT",
                              type=int,
                              default=100,
                              help="minimum inferred gap size [100]")
    scaf_options.add_argument("-m",
                              metavar="INT",
                              type=int,
                              default=100000,
                              help="maximum inferred gap size [100000]")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o",
                            metavar="PATH",
                            type=str,
                            default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w",
                            action='store_true',
                            default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u",
                            action='store_true',
                            default=False,
                            help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug",
                            action='store_true',
                            default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    aln_options.add_argument("-t",
                             metavar="INT",
                             type=int,
                             default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument(
        "--aligner",
        metavar="PATH",
        type=str,
        default="minimap2",
        help="aligner executable ('nucmer' or 'minimap2') [minimap2]")
    mm2_default = "-x asm5"
    aln_options.add_argument(
        "--mm2-params",
        metavar="STR",
        type=str,
        default=mm2_default,
        help="space delimited minimap2 parameters ['%s']" % mm2_default)
    aln_options.add_argument(
        "--nucmer-params",
        metavar="STR",
        type=str,
        default="-l 100 -c 500",
        help="space delimted nucmer parameters ['-l 100 -c 500']")

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        print("\n** The reference and query FASTA files are required **")
        sys.exit()

    log("RagTag " + get_ragtag_version())
    log("CMD: ragtag.py scaffold " + " ".join(sys.argv[1:]))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    group_score_thresh = args.i
    loc_score_thresh = args.a
    orient_score_thresh = args.s
    make_chr0 = args.C
    infer_gaps = args.r
    num_threads = args.t

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    output_path = os.path.abspath(output_path) + "/"

    # Setup a log file for external RagTag scripts
    ragtag_log = output_path + "ragtag.scaffold.err"
    open(ragtag_log, "w").close()  # Wipe the log file

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'."
            )

    # Gap options
    min_gap_size = args.g
    max_gap_size = args.m
    if min_gap_size < 1:
        raise ValueError("the minimum gap size must be positive")

    if max_gap_size < 1:
        raise ValueError("the maximum gap size must be positive")

    # Skip/exclude options
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(args.j)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments
    aligner_path = args.aligner
    aligner = aligner_path.split("/")[-1]
    if aligner.split("/")[-1] not in {'minimap2', 'nucmer'}:
        raise ValueError(
            "Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'."
        )

    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params
    min_mapq = args.q
    if aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.scaffolds.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.scaffolds.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.scaffolds.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.scaffolds.debug.query.info.txt"

    # Align the query to the reference
    log("Mapping the query genome to the reference genome")
    if aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file],
                             aligner_path,
                             mm2_params,
                             output_path + "query_against_ref",
                             in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file],
                           aligner_path,
                           nucmer_params,
                           output_path + "query_against_ref",
                           in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, need to convert from delta to paf
    if aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + "query_against_ref.delta"]
        run_oae(cmd, output_path + "query_against_ref.paf", ragtag_log)

    # Read and organize the alignments
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "query_against_ref.paf",
                                      query_blacklist, ref_blacklist)

    # Filter the alignments
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for i in ctg_alns:

        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[i]))

        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(
            min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:

                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(ctg_alns[i]))

                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist)

    # Remove query sequences which have no more qualifying alignments
    fltrd_ctg_alns = dict()
    for i in ctg_alns:
        if ctg_alns[i] is not None:

            # Write merged alignments and confidence scores
            if debug_mode:
                with open(debug_merged_file, "a") as f:
                    f.write(str(ctg_alns[i]))

                with open(debug_query_info_file, "a") as f:
                    f.write("\t".join([
                        i,
                        ctg_alns[i].best_ref_header,
                        str(ctg_alns[i].grouping_confidence),
                        str(ctg_alns[i].location_confidence),
                        str(ctg_alns[i].orientation_confidence),
                    ]) + "\n")

            if all([
                    ctg_alns[i].grouping_confidence > group_score_thresh,
                    ctg_alns[i].location_confidence > loc_score_thresh,
                    ctg_alns[i].orientation_confidence > orient_score_thresh
            ]):
                fltrd_ctg_alns[i] = ctg_alns[i]

    # For each reference sequence which has at least one assigned query sequence, get the list of
    # all query sequences assigned to that reference sequence.
    log("Ordering and orienting query sequences")
    mapped_ref_seqs = defaultdict(list)
    for i in fltrd_ctg_alns:
        best_ref = fltrd_ctg_alns[i].best_ref_header
        ref_start, ref_end = fltrd_ctg_alns[i].get_best_ref_pos()
        mapped_ref_seqs[best_ref].append((ref_start, ref_end, i))

    # Sort the query sequences for each reference sequence and define the padding sizes between adjacent query seqs
    g_inferred = 0
    g_small = 0
    g_large = 0
    pad_sizes = dict()
    gap_types = dict()
    for i in mapped_ref_seqs:
        # Remove contained contigs and sort the rest
        non_contained = remove_contained(mapped_ref_seqs[i])
        mapped_ref_seqs[i] = sorted(non_contained)
        if infer_gaps:
            # Infer the gap sizes between adjacent query seqs
            # Use the primary alignments to infer gap sizes
            pad_sizes[i] = []
            gap_types[i] = []
            for j in range(1, len(mapped_ref_seqs[i])):
                # Get info for the upstream alignment
                left_ctg = mapped_ref_seqs[i][j - 1][2]
                left_ref_start, left_ref_end = fltrd_ctg_alns[
                    left_ctg].get_best_ref_pos()
                left_qdist_start, left_qdist_end = fltrd_ctg_alns[
                    left_ctg].get_best_q_dist()

                # Get info for the downstream alignment
                right_ctg = mapped_ref_seqs[i][j][2]
                right_ref_start, right_ref_end = fltrd_ctg_alns[
                    right_ctg].get_best_ref_pos()
                right_qdist_start, right_qdist_end = fltrd_ctg_alns[
                    right_ctg].get_best_q_dist()

                # Get the inferred gap size
                i_gap_size = (right_ref_start - right_qdist_start) - (
                    left_ref_end + left_qdist_end)

                # Check if the gap size is too small or too large
                if i_gap_size <= min_gap_size:
                    pad_sizes[i].append(100)
                    gap_types[i].append("U")
                    g_small += 1
                elif i_gap_size > max_gap_size:
                    pad_sizes[i].append(100)
                    gap_types[i].append("U")
                    g_large += 1
                else:
                    pad_sizes[i].append(i_gap_size)
                    gap_types[i].append("N")
                    g_inferred += 1
        else:
            pad_sizes[i] = [100 for _ in range(len(mapped_ref_seqs[i]) - 1)]
            gap_types[i] = ["U" for _ in range(len(mapped_ref_seqs[i]) - 1)]

    if infer_gaps:
        log("%d inferred gap" % g_inferred)
        log("%d adjacent contig within min distance (%d) of each other" %
            (g_small, min_gap_size))
        log("%d inferred gaps exceed length threshold (%d)" %
            (g_large, max_gap_size))

    # Write the scaffolds
    log("Writing scaffolds")

    # Write the intermediate output file in AGP v2.1 format
    log("Writing: " + output_path + "ragtag.scaffolds.agp")
    write_orderings(output_path + "ragtag.scaffolds.agp",
                    output_path + "ragtag.confidence.txt", query_file,
                    mapped_ref_seqs, fltrd_ctg_alns, pad_sizes, gap_types,
                    make_chr0, True, not remove_suffix)

    # Build a FASTA from the AGP
    cmd = [
        "ragtag_agp2fasta.py", output_path + "ragtag.scaffolds.agp", query_file
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.fasta", ragtag_log)

    # Calculate the stats
    cmd = [
        "ragtag_stats.py", output_path + "ragtag.scaffolds.agp",
        output_path + "ragtag.confidence.txt"
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.stats", ragtag_log)

    log("Goodbye")
Example #13
def main():
    description = "Scaffold merging: derive a consensus scaffolding solution by reconciling distinct scaffoldings of " \
                  "'asm.fa'"
    parser = argparse.ArgumentParser(description=description, usage="ragtag.py merge <asm.fa> <scf1.agp> <scf2.agp> [...]")
    parser.add_argument("components", metavar="<asm.fasta>", nargs='?', default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
    parser.add_argument("agps", metavar="<scf1.agp> <scf2.agp> [...]", nargs='*', default=[], type=str, help="scaffolding AGP files")

    merge_options = parser.add_argument_group("merging options")
    merge_options.add_argument("-f", metavar="FILE", default="", type=str, help="CSV list of (AGP file,weight) [null]")
    merge_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query headers to leave unplaced [null]")
    merge_options.add_argument("-l", metavar="INT", default=100000, type=int, help="minimum assembly sequence length [100000]")
    merge_options.add_argument("-e", metavar="FLOAT", default=0.0, type=float, help="minimum edge weight. NA if using Hi-C [0.0]")
    merge_options.add_argument("--gap-func", metavar="STR", default="min", type=str, help="function for merging gap lengths {'min', 'max', or 'mean'} [min]")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS)

    hic_options = parser.add_argument_group("Hi-C options")
    hic_options.add_argument("-b", metavar="FILE", default="", type=str, help="Hi-C alignments in BAM format, sorted by read name [null]")
    hic_options.add_argument("-r", metavar="STR", default="GATC", type=str, help="CSV list of restriction enzymes/sites or 'DNase' [GATC]")
    hic_options.add_argument("-p", metavar="FLOAT", default=1.0, type=float, help="portion of the sequence termini to consider for links [1.0]")
    hic_options.add_argument("--list-enzymes", action='store_true', default=False, help="list all available restriction enzymes/sites")

    args = parser.parse_args()

    # Print a restriction enzyme help message if requested
    if args.list_enzymes:
        RestrictionEnzymes.get_info()
        sys.exit(0)

    if not args.components:
        parser.print_help()
        sys.exit("\n** The assembly FASTA file is required **")

    if not args.agps and not args.f:
        parser.print_help()
        sys.exit("\n** At least two AGP files are required **")

    log("VERSION", "RagTag " + get_ragtag_version())
    log("WARNING", "This is a beta version of `ragtag merge`")
    log("CMD", "ragtag.py merge " + " ".join(sys.argv[1:]))

    # Check that the components FASTA file exists
    comp_fname = args.components
    if not os.path.isfile(comp_fname):
        raise ValueError("Could not find file: %s" % comp_fname)

    # Optional arguments
    agp_fofn = args.f
    hic_bam_fname = args.b
    re_string = args.r
    portion = args.p

    # Set the minimum component sequence length
    min_comp_len = args.l
    if min_comp_len < 0:
        min_comp_len = 0

    # Set the minimum edge weight
    min_edge_weight = args.e
    if min_edge_weight < 0:
        min_edge_weight = 0

    # Set the gap merging function options
    gap_func = args.gap_func.upper()
    if gap_func not in {"MIN", "MAX", "MEAN"}:
        raise ValueError("Gap merging function must be either 'min', 'max', or 'mean'. Got: {}".format(args.gap_func))

    # Debugging options
    debug_mode = args.debug

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    output_path = os.path.abspath(output_path) + "/"
    file_prefix = "ragtag.merge"

    overwrite_files = args.w
    add_suffix = args.u
    if not add_suffix:
        log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    # get the set of contigs to skip
    comp_exclusion_set = set()
    skip_fname = args.j
    if skip_fname:
        skip_fname = os.path.abspath(skip_fname)
        with open(skip_fname, "r") as f:
            for line in f:
                comp_exclusion_set.add(line.rstrip().split()[0])
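    # Note: only the first whitespace-delimited token of each line is used, so a
    # skip-file line like "ctg123  optional note" excludes the header "ctg123".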

    # Setup a file for general logging
    merge_log = output_path + file_prefix + ".err"
    open(merge_log, "w").close()  # Wipe the log file

    # Process the AGP files
    agp_list = [os.path.abspath(i) for i in args.agps]
    weight_list = [1] * len(agp_list)

    # Check for file of AGPs and weights
    if agp_fofn:
        agp_list, weight_list = [], []
        with open(agp_fofn, "r") as f:
            for line in f:
                fields = line.rstrip().split(",")
                agp_list.append(fields[0])
                weight_list.append(float(fields[1]))
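    # Each line of the '-f' CSV pairs an AGP path with a numeric weight, e.g.:
    #   /path/to/scf1.agp,1.0
    #   /path/to/scf2.agp,0.5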

    if len(agp_list) < 2:
        raise ValueError("At least two AGP files are required for merging")

    # Build the graph and filter nodes by sequence length
    log("INFO", "Building the scaffold graph from the AGP files")
    agp_multi_sg = AGPMultiScaffoldGraph(comp_fname)
    agp_multi_sg.add_agps(agp_list, in_weights=weight_list, exclusion_set=comp_exclusion_set)
    if min_comp_len:
        agp_multi_sg.filter_by_seq_len(min_comp_len)
    if debug_mode:
        nx.readwrite.gml.write_gml(agp_multi_sg.graph, output_path + "ragtag.merge.msg.gml")

    # Merge the SAG
    log("INFO", "Merging the scaffold graph")
    agp_sg = agp_multi_sg.merge()

    # Check if we are using Hi-C links to weight the graph.
    if hic_bam_fname:
        log("INFO", "Weighting the scaffold graph with Hi-C links")
        if not comp_fname or not re_string:
            raise RuntimeError("Hi-C requires alignments (-b) assembly sequences (-a) and restriction sites (-r)")

        cmd = [
            "ragtag_create_links.py",
            "-a", comp_fname,
            "-b", hic_bam_fname,
            "-r", re_string,
            "-p", str(portion)
        ]

        out_links_fname = output_path + file_prefix + ".links"
        if os.path.isfile(out_links_fname):
            if not overwrite_files:
                log("INFO", "Retaining pre-existing file: " + out_links_fname)
            else:
                run_oae(cmd, out_links_fname, merge_log)
        else:
            run_oae(cmd, out_links_fname, merge_log)

        hic_sg = build_hic_graph(out_links_fname, comp_fname)
        agp_sg = agp_sg.steal_weights_from(hic_sg)
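        # The AGP graph keeps its topology but adopts its edge weights from the
        # Hi-C link graph, so the downstream matching is driven by Hi-C evidence.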

    # Filter by edge weight
    if min_edge_weight and not hic_bam_fname:
        agp_sg.filter_by_weight(min_edge_weight)

    if debug_mode:
        agp_sg.connect_and_write_gml(output_path + file_prefix + ".sg.gml")

    # Compute a solution to the ScaffoldGraph
    log("INFO", "Computing a scaffolding solution")
    cover_graph = get_maximal_matching(agp_sg)
    if debug_mode:
        tmp_cover_graph = nx.Graph()
        for u, v in cover_graph.edges:
            tmp_cover_graph.add_edge(u, v)
        nx.readwrite.gml.write_gml(tmp_cover_graph, output_path + file_prefix + ".covergraph.gml")

    # Write the scaffolding output to an AGP file
    log("INFO", "Writing results")
    write_agp_solution(cover_graph, agp_sg, output_path + file_prefix + ".agp", gap_func=gap_func, add_suffix_to_unplaced=add_suffix)

    # Generate a FASTA file corresponding to the AGP
    cmd = [
        "ragtag_agp2fa.py",
        output_path + file_prefix + ".agp",
        comp_fname
    ]
    run_oae(cmd, output_path + file_prefix + ".fasta", merge_log)

    log("INFO", "Goodbye")
def main():
    description = """  
    """
    parser = argparse.ArgumentParser(
        description="Quantify links from a Hi-C BAM file.",
        usage=
        "ragtag_create_links.py -c components.fasta -b <hic.bam> -r <RE_site>")
    parser.add_argument("-a",
                        metavar="FILE",
                        default="",
                        type=str,
                        help="assembly fasta file [null]")
    parser.add_argument(
        "-b",
        metavar="FILE",
        default="",
        type=str,
        help="Hi-C alignments in BAM format, sorted by read name [null]")
    parser.add_argument("-r",
                        metavar="STR",
                        default="GATC",
                        type=str,
                        help="CSV list of restriction sites or 'DNase' [GATC]")
    parser.add_argument(
        "-p",
        metavar="FLOAT",
        default=1.0,
        type=float,
        help="portion of the sequence termini to consider for links [1.0]")
    parser.add_argument("--list-enzymes",
                        action='store_true',
                        default=False,
                        help="list all available restriction enzymes/sites")

    args = parser.parse_args()

    # Print a restriction enzyme help message if requested
    if args.list_enzymes:
        RestrictionEnzymes.get_info()
        sys.exit()

    # Continue with normal functionality if no restriction enzyme help message is requested
    if not args.a or not args.b or not args.r:
        parser.print_help()
        sys.exit("\n** The assembly FASTA (-a), Hi-C BAM (-b), and restriction sites (-r) are required **")

    # Set the terminus portion
    portion = args.p
    if not 0 < portion <= 1:
        raise ValueError(
            "portion must be between 0 (exclusive) and 1 (inclusive)")

    asm_file = os.path.abspath(args.a)
    bam_file = os.path.abspath(args.b)

    dnase_mode = False
    re_string = args.r.upper()
    if "DNASE" in re_string:
        dnase_mode = True
        log("Running in DNase mode.")

    re_set = set()
    if not dnase_mode:
        re_set = set(filter(None, args.r.split(",")))
        if not re_set:
            raise ValueError(
                "At least one restriction enzyme/site is needed (-r) if not using 'DNase'."
            )

    # Store the sequence lengths
    asm_lens = dict()
    fai = pysam.FastaFile(asm_file)
    for ref in fai.references:
        asm_lens[ref] = fai.get_reference_length(ref)
    fai.close()

    # Get the left and right cutoff positions for each sequence
    l_cutoffs = dict()
    r_cutoffs = dict()
    for ref in asm_lens:
        left_len = asm_lens[ref] // 2
        right_len = asm_lens[ref] - left_len

        l_cutoffs[ref] = round(left_len * portion)
        r_cutoffs[ref] = asm_lens[ref] - round(right_len * portion)
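    # Worked example: for a 1,000,000 bp sequence with portion=0.5, each half is
    # 500,000 bp, so l_cutoffs = round(500000 * 0.5) = 250,000 and
    # r_cutoffs = 1,000,000 - 250,000 = 750,000; links should then only be
    # counted in the terminal windows [0, 250000) and [750000, 1000000).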

    # Get the raw Hi-C links
    log("Computing raw Hi-C links from: {}".format(bam_file))
    raw_links = count_links(bam_file, l_cutoffs, r_cutoffs)

    # Default normalization factors (used as-is in DNase mode)
    l_norm_factors = l_cutoffs
    r_norm_factors = r_cutoffs

    # Normalize by the number of restriction sites if not using DNase
    if not dnase_mode:
        l_norm_factors = dict()
        r_norm_factors = dict()

        # Set the restriction enzymes
        RE = RestrictionEnzymes(re_set)
        log("Using the following restriction sites:\n{}".format(str(RE)))

        log("Counting restriction sites")
        rfm = RestrictionFragmentMap(asm_file, RE)

        # Get the number of sites for each contig terminus (l/b and r/e)
        for ref in l_cutoffs:
            l_norm_factors[ref] = rfm.count_sites_lte(ref, l_cutoffs[ref])
            r_norm_factors[ref] = rfm.count_sites_gt(ref, r_cutoffs[ref])

    log("Normalizing raw Hi-C links")
    norm_links = normalize_links(raw_links, l_norm_factors, r_norm_factors)
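    # (Assumption, for readers: normalize_links is expected to scale each raw
    # link count by the factors of the two termini involved, correcting for
    # terminus size or restriction-site density; see its definition in RagTag.)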
    write_links(raw_links, norm_links)
Exemple #15
0
def main():
    description = "Homology-based assembly patching: Make continuous joins and fill gaps " \
                  "in 'target.fa' using sequences from 'query.fa'"

    parser = argparse.ArgumentParser(description=description, usage="ragtag.py patch <target.fa> <query.fa>")

    parser.add_argument("reference", metavar="<target.fa>", nargs='?', default="", type=str, help="target fasta file (uncompressed or bgzipped)")
    parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str, help="query fasta file (uncompressed or bgzipped)")

    patch_options = parser.add_argument_group("patching")
    patch_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="", help="list of target sequences to ignore [null]")
    patch_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query sequences to ignore [null]")
    patch_options.add_argument("-f", metavar="INT", type=int, default=1000, help="minimum unique alignment length [1000]")
    patch_options.add_argument("--remove-small", action="store_true", default=False, help="remove unique alignments shorter than '-f'")
    patch_options.add_argument("-q", metavar="INT", type=int, default=10, help="minimum mapq (NA for Nucmer alignments) [10]")
    patch_options.add_argument("-d", metavar="INT", type=int, default=100000, help="maximum alignment merge distance [100000]")
    patch_options.add_argument("-s", metavar="INT", type=int, default=50000, help="minimum merged alignment length [50000]")
    patch_options.add_argument("-i", metavar="FLOAT", type=float, default=0.05, help="maximum merged alignment distance from sequence terminus. fraction of the sequence length if < 1 [0.05]")
    patch_options.add_argument("--fill-only", action="store_true", default=False, help="only fill existing target gaps. do not join target sequences")
    patch_options.add_argument("--join-only", action="store_true", default=False, help="only join and patch target sequences. do not fill existing gaps")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    aln_options.add_argument("-t", metavar="INT", type=int, default=1, help="number of minimap2/unimap threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="nucmer", help="aligner executable ('nucmer' (recommended), 'unimap' or 'minimap2') [nucmer]")
    mm2_default = "-x asm5"
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default, help="space delimited minimap2 parameters (overrides '-t') ['%s']" % mm2_default)
    aln_options.add_argument("--unimap-params", metavar="STR", type=str, default=mm2_default, help="space delimited unimap parameters (overrides '-t') ['%s']" % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="--maxmatch -l 100 -c 500", help="space delimted nucmer parameters ['--maxmatch -l 100 -c 500']")

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        sys.exit("\n** The target and query FASTA files are required **")

    log("VERSION", "RagTag " + get_ragtag_version())
    log("WARNING", "This is a beta version of `ragtag patch`")
    log("CMD", "ragtag.py patch " + " ".join(sys.argv[1:]))

    reference_fn = os.path.abspath(args.reference)
    query_fn = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_fn):
        raise FileNotFoundError("Could not find file: %s" % reference_fn)

    if not os.path.isfile(query_fn):
        raise FileNotFoundError("Could not find file: %s" % query_fn)

    # Alignment processing parameters
    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    num_threads = args.t

    aligner_path = args.aligner
    aligner = aligner_path.split("/")[-1]
    if aligner not in {'minimap2', 'unimap', 'nucmer'}:
        raise ValueError("Must specify either 'minimap2', 'unimap', or 'nucmer' (PATHs allowed) with '--aligner'.")

    mm2_params = args.mm2_params
    unimap_params = args.unimap_params
    nucmer_params = args.nucmer_params

    # Mapq filtering parameters
    min_mapq = args.q
    if aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2/unimap threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)
    if unimap_params == mm2_default:
        unimap_params += " -t " + str(num_threads)
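    # e.g. with the defaults and '-t 8', minimap2/unimap receive '-x asm5 -t 8';
    # a user-supplied '--mm2-params'/'--unimap-params' string is passed through unchanged.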

    # Set reference/query sequences to ignore
    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(skip_file)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    # Supporting alignment parameters
    min_sup_aln_len = args.s
    max_term_dist = args.i
    if max_term_dist <= 0:
        raise ValueError("-i must be a positive nonzero number.")

    # Task options
    fill_only = args.fill_only
    join_only = args.join_only
    if fill_only and join_only:
        raise ValueError("'--fill-only' and '--join-only' cannot be used together")

    # I/O parameters
    add_suffix = args.u
    if not add_suffix:
        log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    overwrite_files = args.w
    output_path = args.o
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    output_path = os.path.abspath(output_path) + "/"
    file_prefix = "ragtag.patch"

    # Setup a log file for external RagTag scripts
    ragtag_log = output_path + file_prefix + ".err"
    open(ragtag_log, "w").close()  # Wipe the log file

    # Debugging options
    debug_mode = args.debug

    # Break the reference assembly at gaps
    cmd = [
        "ragtag_splitasm.py",
        "-o",
        output_path + file_prefix + ".ctg.agp",
        reference_fn
    ]
    reference_ctg_fn = output_path + file_prefix + ".ctg.fasta"
    if os.path.isfile(reference_ctg_fn):
        if overwrite_files:
            log("INFO", "Overwriting pre-existing file: " + reference_ctg_fn)
            run_oae(cmd, reference_ctg_fn, ragtag_log)
        else:
            log("INFO", "Retaining pre-existing file: " + reference_ctg_fn)
    else:
        run_oae(cmd, reference_ctg_fn, ragtag_log)

    # Rename the query sequences
    cmd = [
        "ragtag_rename.py",
        query_fn,
        "-p",
        "qseq",
        "-o",
        output_path + file_prefix + ".rename.agp",
    ]
    query_rename_fn = output_path + file_prefix + ".rename.fasta"
    if os.path.isfile(query_rename_fn):
        if overwrite_files:
            log("INFO", "Overwriting pre-existing file: " + query_rename_fn)
            run_oae(cmd, query_rename_fn, ragtag_log)
        else:
            log("INFO", "Retaining pre-existing file: " + query_rename_fn)
    else:
        run_oae(cmd, query_rename_fn, ragtag_log)

    # Combine the reference contigs and query sequences to make a components fasta file
    components_fn = output_path + file_prefix + ".comps.fasta"
    if os.path.isfile(components_fn):
        if overwrite_files:
            log("INFO", "Overwriting pre-existing file: " + components_fn)
            write_comps = True
        else:
            log("INFO", "Retaining pre-existing file: " + components_fn)
            write_comps = False
    else:
        write_comps = True

    if write_comps:
        log("INFO", "Writing: " + components_fn)
        ref_fai = pysam.FastaFile(reference_ctg_fn)
        query_fai = pysam.FastaFile(query_rename_fn)
        with open(components_fn, "w") as f:
            for ref in ref_fai.references:
                f.write(">" + ref + "\n")
                f.write(ref_fai.fetch(ref) + "\n")

            for query in query_fai.references:
                f.write(">" + query + "\n")
                f.write(query_fai.fetch(query) + "\n")

    # Map the query assembly to the reference contigs
    log("INFO", "Mapping the query genome to the target genome")
    if aligner == "minimap2":
        al = Minimap2Aligner(reference_ctg_fn, [query_rename_fn], aligner_path, mm2_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files)
    elif aligner == "unimap":
        al = UnimapAligner(reference_ctg_fn, [query_rename_fn], aligner_path, unimap_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_ctg_fn, [query_rename_fn], aligner_path, nucmer_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, need to convert from delta to paf
    if aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + file_prefix + ".asm.delta"]
        run_oae(cmd, output_path + file_prefix + ".asm.paf", ragtag_log)

    # Read and organize the alignments
    log("INFO", "Reading whole genome alignments")
    # ctg_alns: query header -> ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + file_prefix + ".asm.paf", query_blacklist, ref_blacklist)

    # Check if any alignments are left
    if not ctg_alns:
        raise RuntimeError("There are no alignments. Check '{}'.".format(output_path + file_prefix + ".asm.paf"))

    # Filter the alignments
    unfiltered_strings, filtered_strings, merged_strings, useful_strings = [], [], [], []
    log("INFO", "Filtering and merging alignments")
    fltrd_ctg_alns = dict()
    for i in ctg_alns:
        # Unique anchor filtering
        unfiltered_strings.append(str(ctg_alns[i]))
        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(min_ulen, keep_small=keep_small_uniques)

        # mapq filtering
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:
                filtered_strings.append(str(ctg_alns[i]))

                # alignment merging
                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist, careful_merge=True)
                if ctg_alns[i] is not None:
                    merged_strings.append(str(ctg_alns[i]))

                    # Length filtering
                    ctg_alns[i] = ctg_alns[i].filter_lengths(min_sup_aln_len)
                    if ctg_alns[i] is not None:
                        # terminal filtering
                        ctg_alns[i] = ctg_alns[i].keep_terminals(max_term_dist)

                        # Save the remaining useful alignments
                        if ctg_alns[i] is not None and ctg_alns[i].num_refs > 1 and not ctg_alns[i].has_internal_ref_cuttings(max_term_dist):
                            useful_strings.append(str(ctg_alns[i]))
                            fltrd_ctg_alns[i] = ctg_alns[i]
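    # Net effect: an alignment set survives only if it passes, in order, unique-anchor
    # filtering (-f), mapq filtering (-q), merging (-d), length filtering (-s), and
    # terminal filtering (-i), and still anchors the query to more than one reference contig.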

    # Write debugging files
    debug_non_fltrd_file = output_path + file_prefix + ".debug.unfiltered.paf"
    debug_fltrd_file = output_path + file_prefix + ".debug.filtered.paf"
    debug_merged_file = output_path + file_prefix + ".debug.merged.paf"
    debug_useful_file = output_path + file_prefix + ".debug.useful.paf"
    if debug_mode:
        with open(debug_non_fltrd_file, "w") as f:
            f.write("".join(unfiltered_strings))

        with open(debug_fltrd_file, "w") as f:
            f.write("".join(filtered_strings))

        with open(debug_merged_file, "w") as f:
            f.write("".join(merged_strings))

        with open(debug_useful_file, "w") as f:
            f.write("".join(useful_strings))

    # Make a Scaffold Graph encoding known reference contigs adjacencies
    log("INFO", "Building a scaffold graph from the contig AGP file")
    agp_multi_sg = AGPMultiScaffoldGraph(reference_ctg_fn)
    agp_multi_sg.add_agps([output_path + file_prefix + ".ctg.agp"])
    agp_sg = agp_multi_sg.merge()

    # As a hack, go through the AGP sg and make the required directed scaffold graph
    agp_psg = PatchScaffoldGraph(components_fn)
    for u, v in agp_sg.edges:
        aln = Alignment(
            u,
            v,
            "",
            agp_sg[u][v]["gap_size"][0],
            0,
            agp_sg[u][v]["gap_size"][0],
            0,
            is_gap=True
        )
        agp_psg.add_edge(u, v, aln)
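    # Each AGP gap thus becomes a placeholder "alignment" (is_gap=True) whose span
    # equals the recorded gap size, letting known adjacencies compete in the same
    # directed graph as alignment-derived joins.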

    # Make a second directed scaffold graph from the alignments
    log("INFO", "Building a scaffold graph from the target/query mappings")
    aln_psg = build_aln_scaffold_graph(fltrd_ctg_alns, components_fn, max_term_dist)

    # Add edges for unfilled gaps
    for u, v in agp_psg.edges:
        if not aln_psg.has_edge(u, v):
            aln_psg.add_edge(u, v, agp_psg[u][v]["alignment"])

    # Remove known false edges
    for u, v in agp_psg.edges:
        for neighbor in list(aln_psg.neighbors(u)):
            if neighbor != v:
                aln_psg.remove_edge(u, neighbor)
                aln_psg.remove_edge(neighbor, u)

        for neighbor in list(aln_psg.neighbors(v)):
            if neighbor != u:
                aln_psg.remove_edge(neighbor, v)
                aln_psg.remove_edge(v, neighbor)
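    # Rationale: the contig AGP records adjacencies known to be true (the contigs
    # came from the same input target sequence), so any alignment-derived edge
    # pairing u or v with a different partner contradicts the input and is dropped.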

    # Adjust the graph depending on if only fills or joins are requested
    if fill_only:
        psg = PatchScaffoldGraph(components_fn)
        for u, v in agp_psg.edges:
            psg.add_edge(u, v, aln_psg[u][v]["alignment"])
            psg.add_edge(v, u, aln_psg[v][u]["alignment"])
        aln_psg = psg

    if join_only:
        for u, v in agp_psg.edges:
            aln_psg[u][v]["alignment"] = agp_psg[u][v]["alignment"]
            aln_psg[v][u]["alignment"] = agp_psg[v][u]["alignment"]
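    # With '--join-only', gap edges revert to their original AGP gap "alignments",
    # so existing gaps are re-emitted as gaps rather than patched with query sequence.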

    if debug_mode:
        aln_psg.write_gml(output_path + file_prefix + ".debug.sg.gml")

    # Compute a matching solution for the graph
    log("INFO", "Computing a matching solution to the scaffold graph")
    match_psg = aln_psg.max_weight_matching()

    if debug_mode:
        match_psg.write_gml(output_path + file_prefix + ".debug.matching.gml")

    # Write the output in AGP format
    log("INFO", "Writing output files")
    match_psg.write_agp(output_path + file_prefix + ".agp", output_path + file_prefix + ".ctg.fasta", add_suffix_to_unplaced=add_suffix)

    # Write the output in fasta format
    cmd = [
        "ragtag_agp2fa.py",
        output_path + file_prefix + ".agp",
        components_fn
    ]
    run_oae(cmd, output_path + file_prefix + ".fasta", ragtag_log)

    log("INFO", "Goodbye")