コード例 #1
0
ファイル: ragtag_correct.py プロジェクト: tclin422/RagTag
def write_breaks(out_file, query_file, ctg_breaks, overwrite, remove_suffix):
    """Write the intermediate file for contig breaks in AGP v2.1 format.

    Every sequence in ``query_file`` becomes one AGP object. A sequence that
    has no entry in ``ctg_breaks`` is written as a single component line; a
    sequence with breakpoints is written as one component line per interval
    between consecutive (sorted) breakpoints, plus a final interval running
    to the end of the sequence.

    :param out_file: path of the AGP file to write
    :param query_file: query FASTA file, opened with ``pysam.FastaFile``
    :param ctg_breaks: dict mapping a query header to its break positions
    :param overwrite: if False and ``out_file`` already exists, the existing
        file is retained and nothing is written
    :param remove_suffix: if True, unbroken sequences keep their header as the
        component ID; otherwise ":0-<length>(+)" is appended to it
    """
    # Check if the output file already exists
    if os.path.isfile(out_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_file)
            return
        log("Overwriting pre-existing file: " + out_file)

    fai = pysam.FastaFile(query_file)
    try:
        agp = AGPFile(out_file, mode="w")
        agp.add_pragma()
        agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

        for q in sorted(fai.references):
            # Look the length up once per sequence instead of on every use.
            q_len = fai.get_reference_length(q)

            # Check if this sequence was broken during misassembly correction
            if q not in ctg_breaks:
                # Add suffix to query header, unless otherwise requested
                comp_header = q
                if not remove_suffix:
                    comp_header = "{}:0-{}(+)".format(q, q_len)

                agp.add_seq_line(q, "1", str(q_len), "1", "W", comp_header,
                                 "1", str(q_len), "+")
                continue

            # This query sequence was broken. Appending the sequence length
            # as a final "breakpoint" makes the last interval (from the last
            # break to the sequence end) fall out of the same loop.
            start = 0
            interval_ends = sorted(ctg_breaks[q]) + [q_len]
            for pid, end in enumerate(interval_ends, start=1):
                agp.add_seq_line(q, str(start + 1), str(end), str(pid), "W",
                                 "{}:{}-{}(+)".format(q, start, end),
                                 "1", str(end - start), "+")
                start = end

        log("Writing: " + out_file)
        agp.write()
    finally:
        # Release the FASTA handle even if building or writing the AGP fails.
        fai.close()
コード例 #2
0
ファイル: ragtag.py プロジェクト: wangdi2014/RagTag
def main():
    """Top-level ``ragtag.py`` dispatcher.

    With no command, or with -h/--help, the usage text is printed.
    -v/--version and -c/--citation print the version and the citation.
    A recognized command runs the matching ragtag script through
    ``subprocess.call`` with the remaining command-line arguments; any other
    command prints the usage text followed by an "unrecognized command" note.
    """
    version = get_ragtag_version()
    citation = """
Alonge, Michael, et al. "RaGOO: fast and accurate reference-guided scaffolding of draft genomes."
Genome biology 20.1 (2019): 1-17.
    """

    usage_text = """
RagTag: Reference-guided scaffolding and misassembly correction.
Version: %s

usage: ragtag.py <command> [options]
    
    commands:
      correct         correct contig misassemblies 
      scaffold        scaffold contigs
      updategff       update gff intervals

    options:
      -c, --citation  
      -v, --version""" % version

    if len(sys.argv) <= 1:
        # Bare invocation: show the usage text (nothing is printed in the
        # corner case of a completely empty argv).
        if sys.argv:
            print(usage_text)
        return

    command = sys.argv[1]
    forwarded_args = sys.argv[2:]

    # Flags that only print a piece of text.
    info_flags = {
        "-h": usage_text,
        "--help": usage_text,
        "-v": version,
        "--version": version,
        "-c": citation,
        "--citation": citation,
    }

    # Commands that are delegated to a separate script.
    command_scripts = {
        "scaffold": "ragtag_scaffold.py",
        "correct": "ragtag_correct.py",
        "updategff": "ragtag_update_gff.py",
    }

    if command in info_flags:
        print(info_flags[command])
    elif command in command_scripts:
        subprocess.call([command_scripts[command]] + forwarded_args)
    else:
        print(usage_text)
        print("\n** unrecognized command: %s" % command)
コード例 #3
0
def main():
    """Command-line entry point for ``ragtag.py updategff``.

    Parses the gff and agp positional arguments plus the ``-c`` switch. If
    either positional argument is missing, the help text is printed and the
    program exits. Otherwise both paths are made absolute and handed to
    ``sub_update`` when ``-c`` is given, or to ``sup_update`` when it is not.
    """
    cli = argparse.ArgumentParser(
        usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>",
        description="Update gff intervals given a RagTag AGP file",
    )
    cli.add_argument(
        "gff", metavar="<genes.gff>", nargs='?', type=str, default="",
        help="gff file",
    )
    cli.add_argument(
        "agp", metavar="<ragtag.*.agp>", nargs='?', type=str, default="",
        help="agp file",
    )
    cli.add_argument(
        "-c", default=False, action="store_true",
        help="update for misassembly correction (ragtag.correction.agp)",
    )

    opts = cli.parse_args()

    # Both input files are required; show the help text if one is missing.
    if not (opts.gff and opts.agp):
        cli.print_help()
        sys.exit()

    log("VERSION", "RagTag " + get_ragtag_version())
    log("CMD", "ragtag.py updategff " + " ".join(sys.argv[1:]))

    gff_path = os.path.abspath(opts.gff)
    agp_path = os.path.abspath(opts.agp)

    # "-c" selects the update routine for misassembly-correction AGP files.
    updater = sub_update if opts.c else sup_update
    updater(gff_path, agp_path)

    log("INFO", "Goodbye")
コード例 #4
0
ファイル: setup.py プロジェクト: zzsunday/RagTag
#!/usr/bin/env python

from setuptools import setup
import glob

from ragtag_utilities.utilities import get_ragtag_version

# Read the README; its text is passed to setup() below as long_description.
with open("README.md", "r") as fh:
    long_description = fh.read()

# Collect files in the current directory whose names match "*.p*".
# NOTE(review): presumably these are the RagTag scripts to install; how this
# list is used is not visible in this part of the file - confirm.
scripts = glob.glob("*.p*")

# Drop the first character of the version string.
# NOTE(review): presumably a leading "v" - confirm against get_ragtag_version().
version = get_ragtag_version()[1:]

setup(name='RagTag',
      version=version,
      author='Michael Alonge',
      author_email='*****@*****.**',
      description='Fast reference-guided genome assembly scaffolding',
      long_description=long_description,
      long_description_content_type="text/markdown",
      url="https://github.com/malonge/RagTag",
      packages=['ragtag_utilities'],
      package_dir={'ragtag_utilities': 'ragtag_utilities/'},
      license="MIT",
      classifiers=[
          "Programming Language :: Python :: 3",
          "License :: OSI Approved :: MIT License",
      ],
      install_requires=[
          'intervaltree',
コード例 #5
0
ファイル: ragtag_correct.py プロジェクト: wook2014/RagTag
def main():
    """Command-line entry point for ``ragtag.py correct``.

    Steps, in order:
      1. Parse and validate the command-line options.
      2. Align the query assembly to the reference (minimap2 or nucmer) and
         read the resulting PAF alignments.
      3. Filter and merge the alignments, then collect candidate breakpoints
         for each query sequence.
      4. Optionally validate the breakpoints with read alignments (-R/-F) and
         drop breakpoints that fall inside GFF features (--gff).
      5. Write the breaks as an AGP file and run ``ragtag_break_query.py`` to
         produce the corrected FASTA file.

    If the reference or query argument is missing, the help text is printed
    and the program exits.

    :raises ValueError: if an input file is missing or an option value is
        invalid (aligner name, --inter with --intra, -T, coverage thresholds)
    """
    parser = argparse.ArgumentParser(
        description='Reference-guided misassembly correction',
        usage="ragtag.py correct <reference.fa> <query.fa>")

    cor_options = parser.add_argument_group("correction options")
    cor_options.add_argument(
        "reference",
        metavar="<reference.fa>",
        nargs='?',
        default="",
        type=str,
        help="reference fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument(
        "query",
        metavar="<query.fa>",
        nargs='?',
        default="",
        type=str,
        help="query fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument("-f",
                             metavar="INT",
                             type=int,
                             default=1000,
                             help="minimum unique alignment length [1000]")
    cor_options.add_argument("--remove-small",
                             action="store_true",
                             default=False,
                             help="remove unique alignments shorter than -f")
    cor_options.add_argument(
        "-q",
        metavar="INT",
        type=int,
        default=10,
        help="minimum mapq (NA for Nucmer alignments) [10]")
    cor_options.add_argument("-d",
                             metavar="INT",
                             type=int,
                             default=100000,
                             help="alignment merge distance [100000]")
    cor_options.add_argument(
        "-b",
        metavar="INT",
        type=int,
        default=5000,
        help="minimum break distance from contig ends [5000]")
    cor_options.add_argument("-e",
                             metavar="<exclude.txt>",
                             type=str,
                             default="",
                             help="list of reference headers to ignore")
    cor_options.add_argument("-j",
                             metavar="<skip.txt>",
                             type=str,
                             default="",
                             help="list of query headers to leave uncorrected")
    cor_options.add_argument(
        "--inter",
        action="store_true",
        default=False,
        help="only break misassemblies between reference sequences")
    cor_options.add_argument(
        "--intra",
        action="store_true",
        default=False,
        help="only break misassemblies within reference sequences")
    cor_options.add_argument("--gff",
                             metavar="<features.gff>",
                             type=str,
                             default="",
                             help="don't break sequences within gff intervals")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o",
                            metavar="PATH",
                            type=str,
                            default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w",
                            action='store_true',
                            default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u",
                            action='store_true',
                            default=False,
                            help="add suffix to unaltered sequence headers")
    io_options.add_argument("--debug",
                            action='store_true',
                            default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    mm2_default = "-x asm5"
    aln_options.add_argument("-t",
                             metavar="INT",
                             type=int,
                             default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument(
        "--aligner",
        metavar="PATH",
        type=str,
        default="minimap2",
        help=
        "whole genome aligner executable ('nucmer' or 'minimap2') [minimap2]")
    aln_options.add_argument(
        "--mm2-params",
        metavar="STR",
        type=str,
        default=mm2_default,
        help="space delimited minimap2 whole genome alignment parameters ['%s']"
        % mm2_default)
    aln_options.add_argument(
        "--nucmer-params",
        metavar="STR",
        type=str,
        default="-l 100 -c 500",
        help=
        "space delimited nucmer whole genome alignment parameters ['-l 100 -c 500']"
    )

    val_options = parser.add_argument_group("validation options")
    val_options.add_argument(
        "--read-aligner",
        metavar="PATH",
        type=str,
        default="minimap2",
        help="read aligner executable (only 'minimap2' is allowed) [minimap2]")
    val_options.add_argument(
        "-R",
        metavar="<reads.fasta>",
        type=str,
        default="",
        help="validation reads. gzipped fastq or fasta allowed.")
    val_options.add_argument("-F",
                             metavar="<reads.fofn>",
                             type=str,
                             default="",
                             help="same as '-R', but a list of files.")
    val_options.add_argument(
        "-T",
        metavar="sr",
        type=str,
        default="",
        help=
        "read type. 'sr' and 'corr' accepted for short reads and error corrected long-reads, respectively."
    )
    val_options.add_argument("-v",
                             metavar="INT",
                             type=int,
                             default=10000,
                             help="coverage validation window size [10000]")
    val_options.add_argument(
        "--max-cov",
        metavar="INT",
        type=int,
        default=-1,
        help="break sequences at regions at or above this coverage level [AUTO]"
    )
    val_options.add_argument(
        "--min-cov",
        metavar="INT",
        type=int,
        default=-1,
        help="break sequences at regions at or below this coverage level [AUTO]"
    )
    val_options.add_argument(
        "-m", metavar="INT", type=int, default=1000, help=argparse.SUPPRESS
    )  # Merge breakpoints within this distance after validation

    args = parser.parse_args()

    # Both FASTA files are required; show the help text if one is missing.
    if not args.reference or not args.query:
        parser.print_help()
        sys.exit()

    log("RagTag " + get_ragtag_version())
    log("CMD: " + " ".join(sys.argv))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    num_threads = args.t
    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    min_break_dist = args.m
    min_break_end_dist = args.b
    val_window_size = args.v

    # I/O options
    output_path = args.o
    # makedirs (rather than mkdir) so a nested output path such as "a/b/out"
    # works even when its parent directories do not exist yet.
    os.makedirs(output_path, exist_ok=True)
    output_path = os.path.abspath(output_path) + "/"

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'."
            )

    gff_file = args.gff
    if gff_file:
        gff_file = os.path.abspath(gff_file)

    # Skip/exclude options
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(args.j)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments. Only the executable's base name is checked, so a
    # full path to minimap2/nucmer is accepted.
    genome_aligner_path = args.aligner
    genome_aligner = genome_aligner_path.split("/")[-1]
    if genome_aligner not in {'minimap2', 'nucmer'}:
        raise ValueError(
            "Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'."
        )

    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params
    min_mapq = args.q
    if genome_aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Check if intra/inter breaking is desired
    break_intra = True
    break_inter = True
    only_intra = args.intra
    only_inter = args.inter
    if only_intra and only_inter:
        raise ValueError(
            "Must specify either '--inter' or '--intra', not both.")

    if only_intra:
        break_inter = False
    if only_inter:
        break_intra = False

    # read-alignment parameters
    val_reads = args.R
    val_reads_fofn = args.F
    val_reads_tech = args.T
    read_aligner_path = args.read_aligner
    read_aligner = read_aligner_path.split("/")[-1]
    if read_aligner != "minimap2":
        raise ValueError(
            "Only minimap2 can be used for read alignments. got: %s" %
            read_aligner)

    # If the genome aligner is minimap2, we can just use that path for read alignment
    if genome_aligner == 'minimap2':
        read_aligner_path = genome_aligner_path

    # Make sure that if -R or -F, -T has been specified.
    if val_reads or val_reads_fofn:
        if not val_reads_tech:
            raise ValueError("'-T' must be provided when using -R or -F.")

    # Make a list of read sequences.
    read_files = []
    if val_reads_fofn:
        with open(val_reads_fofn, "r") as f:
            for line in f:
                read_path = line.rstrip()
                # Skip blank lines: abspath("") is the current directory,
                # which would otherwise be passed on as a read file.
                if read_path:
                    read_files.append(os.path.abspath(read_path))
    elif val_reads:
        read_files.append(os.path.abspath(val_reads))

    # Coverage thresholds. -1 is the "choose automatically" default; any
    # other negative value is rejected.
    max_cov = args.max_cov
    min_cov = args.min_cov

    if max_cov < 0 and max_cov != -1:
        raise ValueError("--max-cov must be >=0")

    if min_cov < 0 and min_cov != -1:
        raise ValueError("--min-cov must be >=0")

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.correction.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.correction.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.correction.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.correction.debug.query.info.txt"

    # Align the query to the reference.
    log("Mapping the query genome to the reference genome")
    if genome_aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file],
                             genome_aligner_path,
                             mm2_params,
                             output_path + "c_query_against_ref",
                             in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file],
                           genome_aligner_path,
                           nucmer_params,
                           output_path + "c_query_against_ref",
                           in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, convert from delta to paf.
    if genome_aligner == "nucmer":
        cmd = [
            "ragtag_delta2paf.py", output_path + "c_query_against_ref.delta"
        ]
        run_o(
            cmd,
            output_path + "c_query_against_ref.paf",
        )

    # Read and organize the alignments.
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "c_query_against_ref.paf",
                                      query_blacklist, ref_blacklist)

    # Filter and merge the alignments.
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for i in ctg_alns:

        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[i]))

        # Each filter may return None, in which case this query sequence is
        # left as None and ignored by the later steps.
        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(
            min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:

                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(ctg_alns[i]))

                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist)

    # Get the putative breakpoints for each query sequence, if any.
    ctg_breaks = dict()
    for i in ctg_alns:
        if ctg_alns[i] is not None:

            # Write merged alignments and confidence scores
            if debug_mode:
                with open(debug_merged_file, "a") as f:
                    f.write(str(ctg_alns[i]))

                with open(debug_query_info_file, "a") as f:
                    f.write("\t".join([
                        i,
                        ctg_alns[i].best_ref_header,
                        str(ctg_alns[i].grouping_confidence),
                        str(ctg_alns[i].location_confidence),
                        str(ctg_alns[i].orientation_confidence),
                    ]) + "\n")

            # Keep only the kinds of breaks that were requested.
            breaks = []
            intra_breaks, inter_breaks = ctg_alns[i].get_break_candidates(
                min_dist=min_break_end_dist)
            if break_intra:
                breaks = breaks + intra_breaks
            if break_inter:
                breaks = breaks + inter_breaks
            if breaks:
                ctg_breaks[i] = breaks

    # If desired, validate the putative breakpoints by observing read coverage.
    if read_files:
        log("Validating putative query breakpoints via read alignment.")
        log("Aligning reads to query sequences.")
        if not os.path.isfile(output_path + "c_reads_against_query.s.bam"):
            if val_reads_tech == "sr":
                al = Minimap2SAMAligner(query_file,
                                        read_files,
                                        read_aligner_path,
                                        "-ax sr -t " + str(num_threads),
                                        output_path + "c_reads_against_query",
                                        in_overwrite=overwrite_files)
            elif val_reads_tech == "corr":
                al = Minimap2SAMAligner(query_file,
                                        read_files,
                                        read_aligner_path,
                                        "-ax asm5 -t " + str(num_threads),
                                        output_path + "c_reads_against_query",
                                        in_overwrite=overwrite_files)
            else:
                raise ValueError("'-T' must be either 'sr' or 'corr'.")
            al.run_aligner()
        else:
            log("Retaining pre-existing read alignments: " + output_path +
                "c_reads_against_query.s.bam")

        # Compress, sort and index the alignments.
        log("Compressing, sorting, and indexing read alignments")
        run_samtools(output_path, num_threads, overwrite_files)

        # Validate the breakpoints
        log("Validating putative query breakpoints")

        # Give at least 10k/1k from ctg ends for coverage to accumulate for corr and sr, respectively.
        val_min_break_end_dist = min_break_end_dist
        if val_reads_tech == "corr":
            val_min_break_end_dist = max(10000, min_break_end_dist)
        if val_reads_tech == "sr":
            val_min_break_end_dist = max(1000, min_break_end_dist)

        # Validate the breakpoints
        ctg_breaks = validate_breaks(ctg_breaks,
                                     output_path,
                                     num_threads,
                                     overwrite_files,
                                     val_min_break_end_dist,
                                     max_cov,
                                     min_cov,
                                     window_size=val_window_size,
                                     clean_dist=min_break_dist,
                                     debug=debug_mode)

    # Check if we need to avoid gff intervals
    if gff_file:
        log("Avoiding breaks within GFF intervals")
        it = make_gff_interval_tree(gff_file)
        non_gff_breaks = dict()
        for ctg in ctg_breaks:
            new_breaks = []
            for i in ctg_breaks[ctg]:
                if it[ctg][i]:
                    log("Avoiding breaking %s at %d. This point intersects a feature in the gff file."
                        % (ctg, i))
                else:
                    new_breaks.append(i)
            if new_breaks:
                non_gff_breaks[ctg] = new_breaks
        ctg_breaks = non_gff_breaks

    # Write the summary of query sequence breaks in AGP format
    agp_file = output_path + "ragtag.correction.agp"
    write_breaks(agp_file, query_file, ctg_breaks, overwrite_files,
                 remove_suffix)

    # Write the scaffolds.
    log("Writing broken contigs")
    qf_name = query_file.split("/")[-1]
    # Strip the last extension. When the name has no ".", rfind returns -1 and
    # slicing with it would cut off the final character, so keep the name as is.
    ext_idx = qf_name.rfind(".")
    qf_pref = qf_name[:ext_idx] if ext_idx != -1 else qf_name
    cmd = ["ragtag_break_query.py", agp_file, query_file]
    run_o(cmd, output_path + qf_pref + ".corrected.fasta")

    log("Goodbye")
コード例 #6
0
ファイル: ScaffoldGraph.py プロジェクト: malonge/RagTag
    def write_agp(self, agp_fn, ref_fn, add_suffix_to_unplaced=False):
        """
        Write the AGP file implied by the scaffold graph.

        Each pass of the outer loop picks an unused edge whose source sequence
        has a combined end-node degree of 2, starts a new object ("scf" plus a
        zero-padded index) with that sequence, and then follows edges from
        sequence to sequence, writing a component line for every sequence and
        a gap or patch line for every positive-length join. Sequences of the
        reference FASTA that were never written are appended afterwards as
        single-component objects.

        :param agp_fn: AGP file name
        :param ref_fn: reference FASTA file name
        :param add_suffix_to_unplaced: add "_RagTag" to unscaffolded sequences
        """
        used_components = set()
        used_edges = set()
        obj_header_idx = -1

        agp = AGPFile(agp_fn, "w")
        agp.add_pragma()
        agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

        while True:
            # Find a starting node
            from_node = None
            to_node = None
            cur_ref = None
            for u, v in sorted(self.edges):
                if (u, v) not in used_edges:
                    # Node names are "<sequence>_b" / "<sequence>_e"; drop the
                    # two-character end suffix to get the sequence name.
                    u_base = u[:-2]

                    # Sum the degree over both end nodes of this sequence.
                    u_degree = 0
                    if u_base + "_b" in self.nodes:
                        u_degree += self.graph.degree[u_base + "_b"]
                    if u_base + "_e" in self.nodes:
                        u_degree += self.graph.degree[u_base + "_e"]

                    # NOTE(review): presumably every join is stored as a pair
                    # of reciprocal directed edges, so 2 means one joined end
                    # and 4 means both ends joined - confirm against the code
                    # that builds the graph.
                    assert u_degree in {2, 4}

                    # Check if we have found a starting target sequence
                    if u_degree == 2:
                        cur_ref = u_base
                        from_node = u
                        to_node = v
                        # Mark the edge as used in both directions.
                        used_edges.add((u, v))
                        used_edges.add((v, u))
                        break

            # If we haven't found a new starting target sequence, we are done
            if from_node is None:
                break

            # Initialize this object
            obj_header_idx += 1
            obj_header = "scf" + "{0:08}".format(obj_header_idx)
            obj_pos = 0  # last object coordinate written so far
            obj_pid = 1  # AGP part number of the next line

            # Process the first target sequence
            cur_ref_len = self.component_lens[cur_ref]
            # Leaving the first sequence through its "_b" node puts it on the
            # minus strand.
            cur_ref_strand = "+"
            if from_node.endswith("_b"):
                cur_ref_strand = "-"
            agp.add_seq_line(obj_header, obj_pos+1, obj_pos+cur_ref_len, obj_pid, "W", cur_ref, 1, cur_ref_len, cur_ref_strand)
            obj_pos += cur_ref_len
            obj_pid += 1
            used_components.add(cur_ref)

            # Process the remaining sequences.
            next_edge_exists = True
            while next_edge_exists:
                # Process the patch
                patch_aln = self.graph[from_node][to_node]["alignment"]
                patch_query = patch_aln.query
                # A truthy patch_aln.strand is written as the minus strand.
                patch_strand = "+"
                if patch_aln.strand:
                    patch_strand = "-"

                # Positive: that many bases are written between the two
                # sequences. Zero or negative: no gap/patch line is written.
                patch_len = patch_aln.their_query_start - patch_aln.my_query_end
                if patch_len > 0:
                    if patch_aln.is_gap:
                        agp.add_gap_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "N", patch_len, "scaffold", "yes", "align_genus")
                    else:
                        agp.add_seq_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "W", patch_query, patch_aln.my_query_end+1, patch_aln.their_query_start, patch_strand)
                        used_components.add(patch_query)
                    obj_pos += patch_len
                    obj_pid += 1

                # Next, process the reference sequence
                # comp_start is 0, or negative when patch_len is negative; in
                # that case the next component starts -comp_start bases in and
                # contributes correspondingly fewer bases to the object.
                comp_start = min(0, patch_len)
                cur_ref = to_node[:-2]
                cur_ref_len = self.component_lens[cur_ref]
                # Entering a sequence through its "_e" node puts it on the
                # minus strand.
                cur_ref_strand = "+"
                if to_node.endswith("_e"):
                    cur_ref_strand = "-"
                agp.add_seq_line(obj_header, obj_pos+1, obj_pos+(cur_ref_len + comp_start), obj_pid, "W", cur_ref, 1+(-1*comp_start), cur_ref_len, cur_ref_strand)
                obj_pos += cur_ref_len + comp_start
                obj_pid += 1
                used_components.add(cur_ref)

                # Look for the next edge
                # Continue from the opposite end of the sequence just written.
                from_node = to_node[:-2] + "_b"
                if to_node.endswith("_b"):
                    from_node = to_node[:-2] + "_e"

                if from_node in self.graph.nodes:
                    next_nodes = set(self.graph[from_node])
                    # Each end node is expected to have exactly one neighbor.
                    assert len(next_nodes) == 1
                    to_node = next_nodes.pop()
                    used_edges.add((from_node, to_node))
                    used_edges.add((to_node, from_node))
                else:
                    # The opposite end is not in the graph: this object ends.
                    next_edge_exists = False

        # Write unplaced reference sequences
        fai = pysam.FastaFile(ref_fn)
        all_ref_seqs = set(fai.references)
        fai.close()
        remaining_components = all_ref_seqs - used_components
        for c in sorted(remaining_components):
            agp.add_seq_line(
                # Multiplying by the flag yields "_RagTag" when it is True and
                # an empty string when it is False.
                c + "_RagTag" * add_suffix_to_unplaced,
                "1",
                str(self.component_lens[c]),
                "1",
                "W",
                c,
                "1",
                str(self.component_lens[c]),
                "+"
            )

        agp.write()
コード例 #7
0
def main():
    """Top-level ``ragtag.py`` dispatcher.

    With no command, or with -h/--help, the usage text is printed.
    -v/--version and -c/--citation print the version and the citations.
    A recognized command runs the matching ragtag script through
    ``subprocess.call`` with the remaining command-line arguments; any other
    command prints the usage text followed by an "unrecognized command" note.
    """
    version = get_ragtag_version()
    citation = """
Alonge, Michael, et al. "Automated assembly scaffolding elevates a new tomato system for high-throughput genome editing."
bioRxiv (2021). 
https://doi.org/10.1101/2021.11.18.469135

** RagTag supersedes RaGOO **

Alonge, Michael, et al. "RaGOO: fast and accurate reference-guided scaffolding of draft genomes."
Genome biology 20.1 (2019): 1-17.
https://doi.org/10.1186/s13059-019-1829-6
    """

    usage_text = """
RagTag: Tools for fast and flexible genome assembly scaffolding and improvement.
Version: %s

usage: ragtag.py <command> [options]
    
    assembly improvement:
      correct         homology-based misassembly correction
      scaffold        homology-based assembly scaffolding
      patch           homology-based assembly patching
      merge           scaffold merging
      
    file utilities:
      agp2fa          build a FASTA file from an AGP file
      agpcheck        check for valid AGP file format
      asmstats        assembly statistics
      splitasm        split an assembly at gaps
      delta2paf       delta to PAF file conversion
      paf2delta       PAF to delta file conversion
      updategff       update gff intervals
      

    options:
      -c, --citation  
      -v, --version""" % version

    if len(sys.argv) <= 1:
        # Bare invocation: show the usage text (nothing is printed in the
        # corner case of a completely empty argv).
        if sys.argv:
            print(usage_text)
        return

    command, forwarded_args = sys.argv[1], sys.argv[2:]

    # Flags that only print a piece of text.
    info_flags = {
        "-h": usage_text,
        "--help": usage_text,
        "-v": version,
        "--version": version,
        "-c": citation,
        "--citation": citation,
    }

    # Commands that are delegated to a separate script.
    command_scripts = {
        "correct": "ragtag_correct.py",
        "scaffold": "ragtag_scaffold.py",
        "merge": "ragtag_merge.py",
        "patch": "ragtag_patch.py",
        "agp2fa": "ragtag_agp2fa.py",
        "agpcheck": "ragtag_agpcheck.py",
        "updategff": "ragtag_update_gff.py",
        "asmstats": "ragtag_asmstats.py",
        "splitasm": "ragtag_splitasm.py",
        "delta2paf": "ragtag_delta2paf.py",
        "paf2delta": "ragtag_paf2delta.py",
    }

    if command in info_flags:
        print(info_flags[command])
        return

    script = command_scripts.get(command)
    if script is None:
        print(usage_text)
        print("\n** unrecognized command: %s **" % command)
        return

    subprocess.call([script] + forwarded_args)
コード例 #8
0
ファイル: ragtag_scaffold.py プロジェクト: tclin422/RagTag
def write_orderings(out_agp_file, out_confidence_file, query_file,
                    ordering_dict, ctg_dict, gap_dict, gap_type_dict,
                    make_chr0, overwrite, add_suffix):
    """ Write the scaffolding solution as an AGP file plus a confidence score table.

    :param out_agp_file: path of the AGP file to write
    :param out_confidence_file: path of the tab-delimited confidence score file
    :param query_file: query FASTA file, opened with pysam.FastaFile to find
        the unplaced sequences and their lengths
    :param ordering_dict: maps each reference header to the ordered list of
        placed query entries; element [2] of each entry is the query header
    :param ctg_dict: maps each placed query header to an object providing
        query_len, orientation, grouping_confidence, location_confidence and
        orientation_confidence
    :param gap_dict: maps each reference header to a list of gap lengths; gap
        i is written right after the i-th query sequence
    :param gap_type_dict: maps each reference header to a list of gap types,
        parallel to the lists in gap_dict
    :param make_chr0: if true, concatenate all unplaced sequences into a
        single "Chr0_RagTag" object separated by 100 bp gaps; otherwise each
        unplaced sequence becomes its own object
    :param overwrite: if false and out_agp_file already exists, return
        without writing anything (the confidence file is not written either)
    :param add_suffix: if true, append "_RagTag" to the object name of
        individually listed unplaced sequences
    """
    # Check if the output file already exists
    if os.path.isfile(out_agp_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_agp_file)
            return

        log("Overwriting pre-existing file: " + out_agp_file)

    # Proceed with writing the intermediate output
    placed_seqs = set()
    all_out_cs_lines = []  # For confidence scores
    agp = AGPFile(out_agp_file, mode="w")

    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Go through the reference sequences in sorted order
    for ref_header in sorted(ordering_dict):
        pid = 1  # AGP part number within the current object
        pos = 0  # End coordinate of the last line written for this object
        new_ref_header = ref_header + "_RagTag"
        gap_seqs = gap_dict[ref_header]
        gap_types = gap_type_dict[ref_header]

        # Iterate through the query sequences for this reference header
        for i, q_entry in enumerate(ordering_dict[ref_header]):
            q = q_entry[2]
            placed_seqs.add(q)
            ctg = ctg_dict[q]
            qlen = ctg.query_len

            obj_start = pos + 1
            pos += qlen
            agp.add_seq_line(new_ref_header, str(obj_start), str(pos),
                             str(pid), "W", q, "1", str(qlen),
                             ctg.orientation)
            pid += 1

            # Save the confidence score info
            all_out_cs_lines.append("\t".join([
                q,
                str(ctg.grouping_confidence),
                str(ctg.location_confidence),
                str(ctg.orientation_confidence),
            ]))

            # Every sequence except the last one is followed by a gap
            if i < len(gap_seqs):
                gap_len = gap_seqs[i]
                obj_start = pos + 1
                pos += gap_len
                agp.add_gap_line(new_ref_header, str(obj_start), str(pos),
                                 str(pid), gap_types[i], str(gap_len),
                                 "scaffold", "yes", "align_genus")
                pid += 1

    # Write unplaced sequences. The try/finally guarantees that the FASTA
    # handle is released even if writing the AGP lines fails.
    fai = pysam.FastaFile(query_file)
    try:
        unplaced_seqs = sorted(set(fai.references) - placed_seqs)
        if unplaced_seqs:
            if make_chr0:
                pos = 0
                pid = 1
                new_ref_header = "Chr0_RagTag"
                for q in unplaced_seqs:
                    qlen = fai.get_reference_length(q)
                    obj_start = pos + 1
                    pos += qlen
                    agp.add_seq_line(new_ref_header, str(obj_start), str(pos),
                                     str(pid), "W", q, "1", str(qlen), "+")
                    pid += 1

                    # Now for the gap, since we are making a chr0
                    obj_start = pos + 1
                    pos += 100
                    agp.add_gap_line(new_ref_header, str(obj_start), str(pos),
                                     str(pid), "U", "100", "contig", "no",
                                     "na")
                    pid += 1

                # Remove the final unnecessary gap
                agp.pop_agp_line()
            else:
                # List the unplaced contigs individually
                for q in unplaced_seqs:
                    qlen = fai.get_reference_length(q)
                    obj_header = q + "_RagTag" if add_suffix else q
                    agp.add_seq_line(obj_header, "1", str(qlen), "1", "W", q,
                                     "1", str(qlen), "+")

        agp.write()
    finally:
        fai.close()

    # Write the confidence scores. One line per placed sequence; when nothing
    # was placed only the header is written (no trailing blank line).
    with open(out_confidence_file, "w") as f:
        f.write(
            "query\tgrouping_confidence\tlocation_confidence\torientation_confidence\n"
        )
        f.writelines(cs_line + "\n" for cs_line in all_out_cs_lines)
コード例 #9
0
ファイル: ragtag_scaffold.py プロジェクト: tclin422/RagTag
def main():
    """ Command-line entry point for `ragtag.py scaffold`.

    Parses the command line, aligns the query assembly to the reference with
    minimap2 or nucmer, filters and merges the alignments, keeps the query
    sequences whose confidence scores exceed the thresholds, orders them
    along their best reference sequence, sizes the gaps between neighbors and
    writes the result as an AGP file and a confidence table. The helper
    scripts are then run to build the scaffold FASTA and the stats file.

    Exits with a non-zero status when the reference or query is missing.
    Raises ValueError for missing input files, non-positive gap sizes, or an
    unsupported aligner.
    """
    parser = argparse.ArgumentParser(
        description='Reference-guided scaffolding',
        usage="ragtag.py scaffold <reference.fa> <query.fa>")

    parser.add_argument("reference",
                        metavar="<reference.fa>",
                        nargs='?',
                        default="",
                        type=str,
                        help="reference fasta file (uncompressed or bgzipped)")
    parser.add_argument("query",
                        metavar="<query.fa>",
                        nargs='?',
                        default="",
                        type=str,
                        help="query fasta file (uncompressed or bgzipped)")

    scaf_options = parser.add_argument_group("scaffolding options")
    scaf_options.add_argument(
        "-e",
        metavar="<exclude.txt>",
        type=str,
        default="",
        help="list of reference headers to ignore [null]")
    scaf_options.add_argument(
        "-j",
        metavar="<skip.txt>",
        type=str,
        default="",
        help="list of query headers to leave unplaced [null]")
    scaf_options.add_argument("-f",
                              metavar="INT",
                              type=int,
                              default=1000,
                              help="minimum unique alignment length [1000]")
    scaf_options.add_argument("--remove-small",
                              action="store_true",
                              default=False,
                              help="remove unique alignments shorter than -f")
    scaf_options.add_argument(
        "-q",
        metavar="INT",
        type=int,
        default=10,
        help="minimum mapq (NA for Nucmer alignments) [10]")
    scaf_options.add_argument("-d",
                              metavar="INT",
                              type=int,
                              default=100000,
                              help="alignment merge distance [100000]")
    scaf_options.add_argument("-i",
                              metavar="FLOAT",
                              type=float,
                              default=0.2,
                              help="minimum grouping confidence score [0.2]")
    scaf_options.add_argument("-a",
                              metavar="FLOAT",
                              type=float,
                              default=0.0,
                              help="minimum location confidence score [0.0]")
    scaf_options.add_argument(
        "-s",
        metavar="FLOAT",
        type=float,
        default=0.0,
        help="minimum orientation confidence score [0.0]")
    scaf_options.add_argument(
        "-C",
        action='store_true',
        default=False,
        help="concatenate unplaced contigs and make 'chr0'")
    scaf_options.add_argument(
        "-r",
        action='store_true',
        default=False,
        help="infer gap sizes. if not, all gaps are 100 bp")
    scaf_options.add_argument("-g",
                              metavar="INT",
                              type=int,
                              default=100,
                              help="minimum inferred gap size [100]")
    scaf_options.add_argument("-m",
                              metavar="INT",
                              type=int,
                              default=100000,
                              help="maximum inferred gap size [100000]")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o",
                            metavar="PATH",
                            type=str,
                            default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w",
                            action='store_true',
                            default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u",
                            action='store_true',
                            default=False,
                            help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug",
                            action='store_true',
                            default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    aln_options.add_argument("-t",
                             metavar="INT",
                             type=int,
                             default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument(
        "--aligner",
        metavar="PATH",
        type=str,
        default="minimap2",
        help="aligner executable ('nucmer' or 'minimap2') [minimap2]")
    mm2_default = "-x asm5"
    aln_options.add_argument(
        "--mm2-params",
        metavar="STR",
        type=str,
        default=mm2_default,
        help="space delimited minimap2 parameters ['%s']" % mm2_default)
    aln_options.add_argument(
        "--nucmer-params",
        metavar="STR",
        type=str,
        default="-l 100 -c 500",
        help="space delimted nucmer parameters ['-l 100 -c 500']")

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        # Passing the message to sys.exit() reports the usage error on stderr
        # and yields a non-zero exit status instead of signalling success.
        sys.exit("\n** The reference and query FASTA files are required **")

    log("RagTag " + get_ragtag_version())
    log("CMD: ragtag.py scaffold " + " ".join(sys.argv[1:]))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    group_score_thresh = args.i
    loc_score_thresh = args.a
    orient_score_thresh = args.s
    make_chr0 = args.C
    infer_gaps = args.r
    num_threads = args.t

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        # makedirs (rather than mkdir) so that a nested '-o a/b/c' also works
        os.makedirs(output_path)
    output_path = os.path.abspath(output_path) + "/"

    # Setup a log file for external RagTag scripts
    ragtag_log = output_path + "ragtag.scaffold.err"
    open(ragtag_log, "w").close()  # Wipe the log file

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'."
            )

    # Gap options
    min_gap_size = args.g
    max_gap_size = args.m
    if min_gap_size < 1:
        raise ValueError("the minimum gap size must be positive")

    if max_gap_size < 1:
        raise ValueError("the maximum gap size must be positive")

    # Skip/exclude options: one header per line
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(args.j)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments. Only the executable's base name decides which
    # aligner is in use, so a full path to the binary is accepted.
    aligner_path = args.aligner
    aligner = aligner_path.split("/")[-1]
    if aligner not in {'minimap2', 'nucmer'}:
        raise ValueError(
            "Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'."
        )

    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params
    min_mapq = args.q
    if aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.scaffolds.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.scaffolds.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.scaffolds.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.scaffolds.debug.query.info.txt"

    # Align the query to the reference
    log("Mapping the query genome to the reference genome")
    if aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file],
                             aligner_path,
                             mm2_params,
                             output_path + "query_against_ref",
                             in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file],
                           aligner_path,
                           nucmer_params,
                           output_path + "query_against_ref",
                           in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, need to convert from delta to paf
    if aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + "query_against_ref.delta"]
        run_oae(cmd, output_path + "query_against_ref.paf", ragtag_log)

    # Read and organize the alignments
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "query_against_ref.paf",
                                      query_blacklist, ref_blacklist)

    # Filter the alignments
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for i in ctg_alns:

        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[i]))

        # Each step may return None, which marks the query as having no
        # qualifying alignments left.
        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(
            min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:

                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(ctg_alns[i]))

                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist)

    # Remove query sequences which have no more qualifying alignments
    fltrd_ctg_alns = dict()
    for i in ctg_alns:
        if ctg_alns[i] is not None:

            # Write merged alignments and confidence scores
            if debug_mode:
                with open(debug_merged_file, "a") as f:
                    f.write(str(ctg_alns[i]))

                with open(debug_query_info_file, "a") as f:
                    f.write("\t".join([
                        i,
                        ctg_alns[i].best_ref_header,
                        str(ctg_alns[i].grouping_confidence),
                        str(ctg_alns[i].location_confidence),
                        str(ctg_alns[i].orientation_confidence),
                    ]) + "\n")

            # All three scores must strictly exceed their thresholds
            if all([
                    ctg_alns[i].grouping_confidence > group_score_thresh,
                    ctg_alns[i].location_confidence > loc_score_thresh,
                    ctg_alns[i].orientation_confidence > orient_score_thresh
            ]):
                fltrd_ctg_alns[i] = ctg_alns[i]

    # For each reference sequence which has at least one assigned query sequence, get the list of
    # all query sequences assigned to that reference sequence.
    log("Ordering and orienting query sequences")
    mapped_ref_seqs = defaultdict(list)
    for i in fltrd_ctg_alns:
        best_ref = fltrd_ctg_alns[i].best_ref_header
        ref_start, ref_end = fltrd_ctg_alns[i].get_best_ref_pos()
        mapped_ref_seqs[best_ref].append((ref_start, ref_end, i))

    # Sort the query sequences for each reference sequence and define the padding sizes between adjacent query seqs
    g_inferred = 0
    g_small = 0
    g_large = 0
    pad_sizes = dict()
    gap_types = dict()
    for i in mapped_ref_seqs:
        # Remove contained contigs and sort the rest
        non_contained = remove_contained(mapped_ref_seqs[i])
        mapped_ref_seqs[i] = sorted(non_contained)
        if infer_gaps:
            # Infer the gap sizes between adjacent query seqs
            # Use the primary alignments to infer gap sizes
            pad_sizes[i] = []
            gap_types[i] = []
            for j in range(1, len(mapped_ref_seqs[i])):
                # Get info for the upstream alignment
                left_ctg = mapped_ref_seqs[i][j - 1][2]
                _, left_ref_end = fltrd_ctg_alns[left_ctg].get_best_ref_pos()
                _, left_qdist_end = fltrd_ctg_alns[left_ctg].get_best_q_dist()

                # Get info for the downstream alignment
                right_ctg = mapped_ref_seqs[i][j][2]
                right_ref_start, _ = fltrd_ctg_alns[
                    right_ctg].get_best_ref_pos()
                right_qdist_start, _ = fltrd_ctg_alns[
                    right_ctg].get_best_q_dist()

                # Get the inferred gap size
                i_gap_size = (right_ref_start - right_qdist_start) - (
                    left_ref_end + left_qdist_end)

                # Check if the gap size is too small or too large. Gaps that
                # fall outside the accepted range become 100 bp gaps of
                # unknown size ("U").
                if i_gap_size <= min_gap_size:
                    pad_sizes[i].append(100)
                    gap_types[i].append("U")
                    g_small += 1
                elif i_gap_size > max_gap_size:
                    pad_sizes[i].append(100)
                    gap_types[i].append("U")
                    g_large += 1
                else:
                    pad_sizes[i].append(i_gap_size)
                    gap_types[i].append("N")
                    g_inferred += 1
        else:
            # One fixed-size gap between each pair of adjacent sequences
            num_gaps = len(mapped_ref_seqs[i]) - 1
            pad_sizes[i] = [100] * num_gaps
            gap_types[i] = ["U"] * num_gaps

    if infer_gaps:
        log("%d inferred gap" % g_inferred)
        log("%d adjacent contig within min distance (%d) of each other" %
            (g_small, min_gap_size))
        log("%d inferred gaps exceed length threshold (%d)" %
            (g_large, max_gap_size))

    # Write the scaffolds
    log("Writing scaffolds")

    # Write the intermediate output file in AGP v2.1 format
    log("Writing: " + output_path + "ragtag.scaffolds.agp")
    write_orderings(output_path + "ragtag.scaffolds.agp",
                    output_path + "ragtag.confidence.txt", query_file,
                    mapped_ref_seqs, fltrd_ctg_alns, pad_sizes, gap_types,
                    make_chr0, True, not remove_suffix)

    # Build a FASTA from the AGP
    cmd = [
        "ragtag_agp2fasta.py", output_path + "ragtag.scaffolds.agp", query_file
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.fasta", ragtag_log)

    # Calculate the stats
    cmd = [
        "ragtag_stats.py", output_path + "ragtag.scaffolds.agp",
        output_path + "ragtag.confidence.txt"
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.stats", ragtag_log)

    log("Goodbye")
コード例 #10
0
ファイル: ragtag_merge.py プロジェクト: malonge/RagTag
def main():
    """
    Command-line entry point for `ragtag.py merge`.

    Parses the command line, builds a multi scaffold graph from the supplied AGP files (optionally weighted through
    the '-f' CSV file), merges it into a single scaffold graph, optionally re-weights that graph with Hi-C links,
    computes a matching and writes the resulting scaffolding as an AGP file. A helper script is then run to build the
    corresponding FASTA file.

    Exits early when '--list-enzymes' is given or when required arguments are missing. Raises ValueError for a missing
    assembly file, an unknown gap merging function, or fewer than two AGP files.
    """
    description = "Scaffold merging: derive a consensus scaffolding solution by reconciling distinct scaffoldings of " \
                  "'asm.fa'"
    parser = argparse.ArgumentParser(description=description, usage="ragtag.py merge <asm.fa> <scf1.agp> <scf2.agp> [...]")
    parser.add_argument("components", metavar="<asm.fasta>", nargs='?', default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
    parser.add_argument("agps", metavar="<scf1.agp> <scf2.agp> [...]", nargs='*', default=[], type=str, help="scaffolding AGP files")

    merge_options = parser.add_argument_group("merging options")
    merge_options.add_argument("-f", metavar="FILE", default="", type=str, help="CSV list of (AGP file,weight) [null]")
    merge_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query headers to leave unplaced [null]")
    merge_options.add_argument("-l", metavar="INT", default=100000, type=int, help="minimum assembly sequence length [100000]")
    merge_options.add_argument("-e", metavar="FLOAT", default=0.0, type=float, help="minimum edge weight. NA if using Hi-C [0.0]")
    merge_options.add_argument("--gap-func", metavar="STR", default="min", type=str, help="function for merging gap lengths {'min', 'max', or 'mean'} [min]")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS)

    hic_options = parser.add_argument_group("Hi-C options")
    hic_options.add_argument("-b", metavar="FILE", default="", type=str, help="Hi-C alignments in BAM format, sorted by read name [null]")
    hic_options.add_argument("-r", metavar="STR", default="GATC", type=str, help="CSV list of restriction enzymes/sites or 'DNase' [GATC]")
    hic_options.add_argument("-p", metavar="FLOAT", default=1.0, type=float, help="portion of the sequence termini to consider for links [1.0]")
    hic_options.add_argument("--list-enzymes", action='store_true', default=False, help="list all available restriction enzymes/sites")

    args = parser.parse_args()

    # Print a restriction enzyme help message if requested
    if args.list_enzymes:
        RestrictionEnzymes.get_info()
        sys.exit(0)

    if not args.components:
        parser.print_help()
        sys.exit("\n** The assembly FASTA file is required **")

    if not args.agps and not args.f:
        parser.print_help()
        sys.exit("\n** At least two AGP files are required **")

    log("VERSION", "RagTag " + get_ragtag_version())
    log("WARNING", "This is a beta version of `ragtag merge`")
    log("CMD", "ragtag.py merge " + " ".join(sys.argv[1:]))

    # Check that the components FASTA file exists
    comp_fname = args.components
    if not os.path.isfile(comp_fname):
        raise ValueError("Could not find file: %s" % comp_fname)

    # Optional arguments
    agp_fofn = args.f
    hic_bam_fname = args.b
    re_string = args.r
    portion = args.p

    # Set the minimum component sequence length (negative values are clamped to 0)
    min_comp_len = max(args.l, 0)

    # Set the minimum edge weight (negative values are clamped to 0)
    min_edge_weight = args.e
    if min_edge_weight < 0:
        min_edge_weight = 0

    # Set the gap merging function options
    gap_func = args.gap_func.upper()
    if gap_func not in {"MIN", "MAX", "MEAN"}:
        raise ValueError("Gap merging function must be either 'min', 'max', or 'mean'. Got: {}".format(args.gap_func))

    # Debugging options
    debug_mode = args.debug

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        # makedirs (rather than mkdir) so that a nested '-o a/b/c' also works
        os.makedirs(output_path)
    output_path = os.path.abspath(output_path) + "/"
    file_prefix = "ragtag.merge"

    overwrite_files = args.w
    add_suffix = args.u
    if not add_suffix:
        log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    # get the set of contigs to skip
    comp_exclusion_set = set()
    skip_fname = args.j
    if skip_fname:
        skip_fname = os.path.abspath(skip_fname)
        with open(skip_fname, "r") as f:
            for line in f:
                # Only the first whitespace-delimited token is the header.
                # Blank lines have no token and are ignored instead of raising an IndexError.
                tokens = line.split()
                if tokens:
                    comp_exclusion_set.add(tokens[0])

    # Setup a file for general logging
    merge_log = output_path + file_prefix + ".err"
    open(merge_log, "w").close()  # Wipe the log file

    # Process the AGP files. AGP files given on the command line all get a weight of 1.
    agp_list = [os.path.abspath(i) for i in args.agps]
    weight_list = [1] * len(agp_list)

    # Check for file of AGPs and weights. When given, it replaces the AGP files from the command line.
    if agp_fofn:
        agp_list, weight_list = [], []
        with open(agp_fofn, "r") as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at the end of the file)
                    continue
                fields = line.split(",")
                agp_list.append(fields[0])
                weight_list.append(float(fields[1]))

    if len(agp_list) < 2:
        raise ValueError("At least two AGP files are required for merging")

    # Build the graph and filter nodes by sequence length
    log("INFO", "Building the scaffold graph from the AGP files")
    agp_multi_sg = AGPMultiScaffoldGraph(comp_fname)
    agp_multi_sg.add_agps(agp_list, in_weights=weight_list, exclusion_set=comp_exclusion_set)
    if min_comp_len:
        agp_multi_sg.filter_by_seq_len(min_comp_len)
    if debug_mode:
        nx.readwrite.gml.write_gml(agp_multi_sg.graph, output_path + "ragtag.merge.msg.gml")

    # Merge the SAG
    log("INFO", "Merging the scaffold graph")
    agp_sg = agp_multi_sg.merge()

    # Check if we are using Hi-C links to weight the graph.
    if hic_bam_fname:
        log("INFO", "Weighting the scaffold graph with Hi-C links")
        if not comp_fname or not re_string:
            raise RuntimeError("Hi-C requires alignments (-b) assembly sequences (-a) and restriction sites (-r)")

        cmd = [
            "ragtag_create_links.py",
            "-a", comp_fname,
            "-b", hic_bam_fname,
            "-r", re_string,
            "-p", str(portion)
        ]

        # Only reuse an existing links file when overwriting was not requested
        out_links_fname = output_path + file_prefix + ".links"
        if os.path.isfile(out_links_fname) and not overwrite_files:
            log("INFO", "Retaining pre-existing file: " + out_links_fname)
        else:
            run_oae(cmd, out_links_fname, merge_log)

        hic_sg = build_hic_graph(out_links_fname, comp_fname)
        agp_sg = agp_sg.steal_weights_from(hic_sg)

    # Filter by edge weight (not applicable when the weights come from Hi-C)
    if min_edge_weight and not hic_bam_fname:
        agp_sg.filter_by_weight(min_edge_weight)

    if debug_mode:
        agp_sg.connect_and_write_gml(output_path + file_prefix + ".sg.gml")

    # Compute a solution to the ScaffoldGraph
    log("INFO", "Computing a scaffolding solution")
    cover_graph = get_maximal_matching(agp_sg)
    if debug_mode:
        # Copy only the edges into a plain graph before writing the GML file
        tmp_cover_graph = nx.Graph()
        for u, v in cover_graph.edges:
            tmp_cover_graph.add_edge(u, v)
        nx.readwrite.gml.write_gml(tmp_cover_graph, output_path + file_prefix + ".covergraph.gml")

    # Write the scaffolding output to an AGP file
    log("INFO", "Writing results")
    write_agp_solution(cover_graph, agp_sg, output_path + file_prefix + ".agp", gap_func=gap_func, add_suffix_to_unplaced=add_suffix)

    # Generate a FASTA file corresponding to the AGP
    cmd = [
        "ragtag_agp2fa.py",
        output_path + file_prefix + ".agp",
        comp_fname
    ]
    run_oae(cmd, output_path + file_prefix + ".fasta", merge_log)

    log("INFO", "Goodbye")
コード例 #11
0
ファイル: ragtag_merge.py プロジェクト: malonge/RagTag
def write_agp_solution(cover_graph, scaffold_graph, agp_fname, gap_func="MIN", add_suffix_to_unplaced=False):
    """
    Write a scaffolding solution to an AGP file.

    Two graphs are used here: a cover_graph and a scaffold_graph. The cover graph defines a solution to the
    scaffold graph: each of its connected components is walked as a path, from one degree-1 node to the other,
    and becomes one AGP object. The scaffold_graph is used for any original scaffold graph info/functionality
    (adjacency data and component lengths).

    Node names are a component name followed by a two-character end suffix ("_b" or "_e"); the suffix is
    stripped to recover the component name.

    :param cover_graph: graph defining the solution; its connected components are traversed as paths
    :param scaffold_graph: ScaffoldGraphBase instance providing adjacency data and component lengths
    :param agp_fname: path of the AGP file to write
    :param gap_func: forwarded to get_gap_size() when an adjacency has more than one known gap size
    :param add_suffix_to_unplaced: if true, "_RagTag" is appended to the object name of unplaced components
    :raises TypeError: if scaffold_graph is not a ScaffoldGraphBase instance
    """
    if not isinstance(scaffold_graph, ScaffoldGraphBase):
        raise TypeError("scaffold_graph must be an instance of ScaffoldGraph")

    placed_components = set()

    agp = AGPFile(agp_fname, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Iterate through the connected components
    for i, cc in enumerate(nx.connected_components(G=cover_graph)):
        # Sort the nodes for deterministic output
        cc = sorted(cc)
        obj_header = "scf" + "{0:08}".format(i+1) + "_RagTag"
        current_node = None

        # Iterate over each node in the connected component until we find a node with degree=1
        for node in cc:
            if cover_graph.degree[node] == 1:
                current_node = node
                break

        # A component without a degree=1 node cannot be walked as a path
        assert current_node is not None

        # Starting with the degree=1 node, build the AGP object from nodes in the path.
        visited_nodes = {current_node}
        degree = 0
        obj_id = 1
        obj_pos = 0

        # Traverse the component until we find the other end node
        while degree != 1:
            conn_nodes = set(cover_graph.neighbors(current_node))
            next_node = (conn_nodes - visited_nodes).pop()
            degree = cover_graph.degree[next_node]
            comp_len = scaffold_graph.get_component_len(next_node[:-2])

            # Check if this is an intra or inter sequence edge
            orientation = "+"
            if next_node[:-2] == current_node[:-2]:
                # Both nodes are ends of the same component: emit a sequence line.
                # Arriving at the "_b" end means the component was entered from its "_e" end.
                if next_node.endswith("_b"):
                    orientation = "-"
                    assert current_node.endswith("_e")

                agp.add_seq_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + comp_len),
                    str(obj_id),
                    "W",
                    next_node[:-2],
                    "1",
                    str(comp_len),
                    orientation
                )
                obj_pos += comp_len
                placed_components.add(next_node[:-2])
            else:
                # The nodes belong to different components: emit a gap line.
                # Organize the gap info
                adjacency_data = scaffold_graph[current_node][next_node]

                # AGP Column 5
                all_is_known_gap_size = adjacency_data["is_known_gap_size"]
                comp_type = "N" if any(all_is_known_gap_size) else "U"

                # AGP column 6b (defaults to 100 when no gap size is known)
                gap_size = 100
                all_gap_sizes = adjacency_data["gap_size"]
                fltrd_gap_sizes = [
                    size for size, is_known in zip(all_gap_sizes, all_is_known_gap_size) if is_known
                ]
                if fltrd_gap_sizes:
                    if len(fltrd_gap_sizes) == 1:
                        gap_size = fltrd_gap_sizes[0]
                    else:
                        gap_size = get_gap_size(fltrd_gap_sizes, gap_func)

                # AGP column 7b (falls back to "scaffold" unless all gap types agree)
                all_gap_types = set(adjacency_data["gap_type"])
                gap_type = "scaffold"
                if len(all_gap_types) == 1:
                    gap_type = all_gap_types.pop()

                # AGP column 8b
                has_linkage = "yes" if any(adjacency_data["linkage"]) else "no"

                # AGP column 9b
                all_evidences = set(adjacency_data["linkage_evidence"])
                linkage_evidence = "na"
                if has_linkage == "yes":
                    all_evidences.discard("na")
                    # Sets have no stable iteration order, so sort for deterministic output
                    linkage_evidence = ";".join(sorted(str(e) for e in all_evidences))

                agp.add_gap_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + gap_size),
                    str(obj_id),
                    comp_type,
                    str(gap_size),
                    gap_type,
                    has_linkage,
                    linkage_evidence
                )
                obj_pos += gap_size

            obj_id += 1
            visited_nodes.add(next_node)
            current_node = next_node

    # Write all unplaced contigs, sorted so that the output does not depend on set iteration order
    obj_suffix = "_RagTag" if add_suffix_to_unplaced else ""
    remaining_components = scaffold_graph.components - placed_components
    for c in sorted(remaining_components):
        comp_len = str(scaffold_graph.get_component_len(c))
        agp.add_seq_line(
            c + obj_suffix,
            "1",
            comp_len,
            "1",
            "W",
            c,
            "1",
            comp_len,
            "+"
        )

    agp.write()
コード例 #12
0
def main():
    """
    Top-level `ragtag.py` dispatcher.

    With no arguments, or with -h/--help, print the usage text. -v/--version and -c/--citation print the
    version and the citation. A recognized command is forwarded, together with the remaining command line
    arguments, to the matching RagTag script. Anything else prints the usage text followed by an
    "unrecognized command" notice.
    """
    ragtag_version = get_ragtag_version()
    citation_text = """
Alonge, Michael, et al. "RaGOO: fast and accurate reference-guided scaffolding of draft genomes."
Genome biology 20.1 (2019): 1-17.
    """

    usage_text = """
RagTag: Reference-guided scaffolding and misassembly correction.
Version: %s

usage: ragtag.py <command> [options]
    
    assembly improvement:
      correct         misassembly correction 
      scaffold        synteny scaffolding
      merge           scaffold merging
      
    file utilities:
      agp2fasta       build a FASTA file from an AGP file
      agpcheck        check for valid AGP file format
      updategff       update gff intervals
      
    

    options:
      -c, --citation  
      -v, --version""" % ragtag_version

    # Command name -> script that implements it
    command_scripts = {
        "correct": "ragtag_correct.py",
        "scaffold": "ragtag_scaffold.py",
        "merge": "ragtag_merge.py",
        "agp2fasta": "ragtag_agp2fasta.py",
        "agpcheck": "ragtag_agpcheck.py",
        "updategff": "ragtag_update_gff.py",
    }

    n_args = len(sys.argv)
    if n_args <= 1:
        # Only the program name was given: show the usage text
        if n_args == 1:
            print(usage_text)
        return

    command = sys.argv[1]
    if command in ("-h", "--help"):
        print(usage_text)
    elif command in ("-v", "--version"):
        print(ragtag_version)
    elif command in ("-c", "--citation"):
        print(citation_text)
    elif command in command_scripts:
        # Hand everything after the command name to the sub-command script
        subprocess.call([command_scripts[command]] + sys.argv[2:])
    else:
        print(usage_text)
        print("\n** unrecognized command: %s **" % command)
コード例 #13
0
def main():
    """
    Command line entry point for `ragtag.py patch`.

    Steps, in order:
      1. Parse and validate the command line arguments.
      2. Split the target at gaps (ragtag_splitasm.py) and rename the query sequences (ragtag_rename.py).
      3. Concatenate the target contigs and renamed query sequences into a components FASTA file.
      4. Align the query to the target contigs, then filter and merge the alignments.
      5. Build a scaffold graph from the contig AGP and another from the alignments, combine them, and
         compute a matching.
      6. Write the solution as AGP and build the corresponding FASTA file (ragtag_agp2fa.py).

    Pre-existing intermediate files are retained unless '-w' is given.

    :raises FileNotFoundError: if the target or query FASTA file does not exist
    :raises ValueError: for an unsupported aligner, a non-positive '-i', or when '--fill-only' and
                        '--join-only' are combined
    :raises RuntimeError: if no alignments are read from the PAF file
    """
    description = "Homology-based assembly patching: Make continuous joins and fill gaps " \
                  "in 'target.fa' using sequences from 'query.fa'"

    parser = argparse.ArgumentParser(description=description, usage="ragtag.py patch <target.fa> <query.fa>")

    parser.add_argument("reference", metavar="<target.fa>", nargs='?', default="", type=str, help="target fasta file (uncompressed or bgzipped)")
    parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str, help="query fasta file (uncompressed or bgzipped)")

    patch_options = parser.add_argument_group("patching")
    patch_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="", help="list of target sequences to ignore [null]")
    patch_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query sequences to ignore [null]")
    patch_options.add_argument("-f", metavar="INT", type=int, default=1000, help="minimum unique alignment length [1000]")
    patch_options.add_argument("--remove-small", action="store_true", default=False, help="remove unique alignments shorter than '-f'")
    patch_options.add_argument("-q", metavar="INT", type=int, default=10, help="minimum mapq (NA for Nucmer alignments) [10]")
    patch_options.add_argument("-d", metavar="INT", type=int, default=100000, help="maximum alignment merge distance [100000]")
    patch_options.add_argument("-s", metavar="INT", type=int, default=50000, help="minimum merged alignment length [50000]")
    patch_options.add_argument("-i", metavar="FLOAT", type=float, default=0.05, help="maximum merged alignment distance from sequence terminus. fraction of the sequence length if < 1 [0.05]")
    patch_options.add_argument("--fill-only", action="store_true", default=False, help="only fill existing target gaps. do not join target sequences")
    patch_options.add_argument("--join-only", action="store_true", default=False, help="only join and patch target sequences. do not fill existing gaps")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    aln_options.add_argument("-t", metavar="INT", type=int, default=1, help="number of minimap2/unimap threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="nucmer", help="aligner executable ('nucmer' (recommended), 'unimap' or 'minimap2') [nucmer]")
    mm2_default = "-x asm5"
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default, help="space delimited minimap2 parameters (overrides '-t') ['%s']" % mm2_default)
    aln_options.add_argument("--unimap-params", metavar="STR", type=str, default=mm2_default, help="space delimited unimap parameters (overrides '-t') ['%s']" % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="--maxmatch -l 100 -c 500", help="space delimted nucmer parameters ['--maxmatch -l 100 -c 500']")

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        sys.exit("\n** The target and query FASTA files are required **")

    log("VERSION", "RagTag " + get_ragtag_version())
    log("WARNING", "This is a beta version of `ragtag patch`")
    log("CMD", "ragtag.py patch " + " ".join(sys.argv[1:]))

    reference_fn = os.path.abspath(args.reference)
    query_fn = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_fn):
        raise FileNotFoundError("Could not find file: %s" % reference_fn)

    if not os.path.isfile(query_fn):
        raise FileNotFoundError("Could not find file: %s" % query_fn)

    # Alignment processing parameters
    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    num_threads = args.t

    # The aligner may be given as a path; only its final path element selects the aligner type
    aligner_path = args.aligner
    aligner = aligner_path.split("/")[-1]
    if aligner not in {'minimap2', 'unimap', 'nucmer'}:
        raise ValueError("Must specify either 'minimap2', 'unimap', or 'nucmer' (PATHs allowed) with '--aligner'.")

    mm2_params = args.mm2_params
    unimap_params = args.unimap_params
    nucmer_params = args.nucmer_params

    # Mapq filtering parameters (mapq filtering is disabled for Nucmer alignments)
    min_mapq = args.q
    if aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2/unimap threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)
    if unimap_params == mm2_default:
        unimap_params += " -t " + str(num_threads)

    # Set reference/query sequences to ignore
    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(skip_file)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    # Supporting alignment parameters
    min_sup_aln_len = args.s
    max_term_dist = args.i
    if max_term_dist <= 0:
        raise ValueError("-i must be a positive nonzero number.")

    # Task options
    fill_only = args.fill_only
    join_only = args.join_only
    if fill_only and join_only:
        raise ValueError("'--fill-only' and '--join-only' cannot be used together")

    # I/O parameters
    add_suffix = args.u
    if not add_suffix:
        log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    overwrite_files = args.w
    output_path = args.o
    # makedirs also creates missing parent directories and tolerates a pre-existing directory
    os.makedirs(output_path, exist_ok=True)
    output_path = os.path.abspath(output_path) + "/"
    file_prefix = "ragtag.patch"

    # Setup a log file for external RagTag scripts
    ragtag_log = output_path + file_prefix + ".err"
    open(ragtag_log, "w").close()  # Wipe the log file

    # Debugging options
    debug_mode = args.debug

    # Break the reference assembly at gaps
    cmd = [
        "ragtag_splitasm.py",
        "-o",
        output_path + file_prefix + ".ctg.agp",
        reference_fn
    ]
    reference_ctg_fn = output_path + file_prefix + ".ctg.fasta"
    if os.path.isfile(reference_ctg_fn):
        if overwrite_files:
            log("INFO", "Overwriting pre-existing file: " + reference_ctg_fn)
            run_oae(cmd, reference_ctg_fn, ragtag_log)
        else:
            log("INFO", "Retaining pre-existing file: " + reference_ctg_fn)
    else:
        run_oae(cmd, reference_ctg_fn, ragtag_log)

    # Rename the query sequences
    cmd = [
        "ragtag_rename.py",
        query_fn,
        "-p",
        "qseq",
        "-o",
        output_path + file_prefix + ".rename.agp",
    ]
    query_rename_fn = output_path + file_prefix + ".rename.fasta"
    if os.path.isfile(query_rename_fn):
        if overwrite_files:
            log("INFO", "Overwriting pre-existing file: " + query_rename_fn)
            run_oae(cmd, query_rename_fn, ragtag_log)
        else:
            log("INFO", "Retaining pre-existing file: " + query_rename_fn)
    else:
        run_oae(cmd, query_rename_fn, ragtag_log)

    # Combine the reference contigs and query sequences to make a components fasta file
    components_fn = output_path + file_prefix + ".comps.fasta"
    if os.path.isfile(components_fn):
        if overwrite_files:
            log("INFO", "Overwriting pre-existing file: " + components_fn)
            write_comps = True
        else:
            log("INFO", "Retaining pre-existing file: " + components_fn)
            write_comps = False
    else:
        write_comps = True

    if write_comps:
        log("INFO", "Writing: " + components_fn)
        ref_fai = pysam.FastaFile(reference_ctg_fn)
        query_fai = None
        try:
            query_fai = pysam.FastaFile(query_rename_fn)
            with open(components_fn, "w") as f:
                for ref in ref_fai.references:
                    f.write(">" + ref + "\n")
                    f.write(ref_fai.fetch(ref) + "\n")

                for query in query_fai.references:
                    f.write(">" + query + "\n")
                    f.write(query_fai.fetch(query) + "\n")
        finally:
            # Release the FASTA handles even if writing fails
            ref_fai.close()
            if query_fai is not None:
                query_fai.close()

    # Map the query assembly to the reference contigs
    log("INFO", "Mapping the query genome to the target genome")
    if aligner == "minimap2":
        al = Minimap2Aligner(reference_ctg_fn, [query_rename_fn], aligner_path, mm2_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files)
    elif aligner == "unimap":
        al = UnimapAligner(reference_ctg_fn, [query_rename_fn], aligner_path, unimap_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_ctg_fn, [query_rename_fn], aligner_path, nucmer_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, need to convert from delta to paf
    if aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + file_prefix + ".asm.delta"]
        run_oae(cmd, output_path + file_prefix + ".asm.paf", ragtag_log)

    # Read and organize the alignments
    log("INFO", "Reading whole genome alignments")
    # ctg_alns: query header -> ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + file_prefix + ".asm.paf", query_blacklist, ref_blacklist)

    # Check if any alignments are left
    if not ctg_alns:
        raise RuntimeError("There are no alignments. Check '{}'.".format(output_path + file_prefix + ".asm.paf"))

    # Filter the alignments. Each filter may return None, which drops the query from later stages.
    unfiltered_strings, filtered_strings, merged_strings, useful_strings = [], [], [], []
    log("INFO", "Filtering and merging alignments")
    fltrd_ctg_alns = dict()
    for i in ctg_alns:
        # Unique anchor filtering
        unfiltered_strings.append(str(ctg_alns[i]))
        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(min_ulen, keep_small=keep_small_uniques)

        # mapq filtering
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:
                filtered_strings.append(str(ctg_alns[i]))

                # alignment merging
                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist, careful_merge=True)
                if ctg_alns[i] is not None:
                    merged_strings.append(str(ctg_alns[i]))

                    # Length filtering
                    ctg_alns[i] = ctg_alns[i].filter_lengths(min_sup_aln_len)
                    if ctg_alns[i] is not None:
                        # terminal filtering
                        ctg_alns[i] = ctg_alns[i].keep_terminals(max_term_dist)

                        # Save the remaining useful alignments
                        if ctg_alns[i] is not None and ctg_alns[i].num_refs > 1 and not ctg_alns[i].has_internal_ref_cuttings(max_term_dist):
                            useful_strings.append(str(ctg_alns[i]))
                            fltrd_ctg_alns[i] = ctg_alns[i]

    # Write debugging files
    debug_non_fltrd_file = output_path + file_prefix + ".debug.unfiltered.paf"
    debug_fltrd_file = output_path + file_prefix + ".debug.filtered.paf"
    debug_merged_file = output_path + file_prefix + ".debug.merged.paf"
    debug_useful_file = output_path + file_prefix + ".debug.useful.paf"
    if debug_mode:
        with open(debug_non_fltrd_file, "w") as f:
            f.write("".join(unfiltered_strings))

        with open(debug_fltrd_file, "w") as f:
            f.write("".join(filtered_strings))

        with open(debug_merged_file, "w") as f:
            f.write("".join(merged_strings))

        with open(debug_useful_file, "w") as f:
            f.write("".join(useful_strings))

    # Make a Scaffold Graph encoding known reference contigs adjacencies
    log("INFO", "Building a scaffold graph from the contig AGP file")
    agp_multi_sg = AGPMultiScaffoldGraph(reference_ctg_fn)
    agp_multi_sg.add_agps([output_path + file_prefix + ".ctg.agp"])
    agp_sg = agp_multi_sg.merge()

    # As a hack, go through the AGP sg and make the required directed scaffold graph
    agp_psg = PatchScaffoldGraph(components_fn)
    for u, v in agp_sg.edges:
        aln = Alignment(
            u,
            v,
            "",
            agp_sg[u][v]["gap_size"][0],
            0,
            agp_sg[u][v]["gap_size"][0],
            0,
            is_gap=True
        )
        agp_psg.add_edge(u, v, aln)

    # Make a second directed scaffold graph from the alignments
    log("INFO", "Building a scaffold graph from the target/query mappings")
    aln_psg = build_aln_scaffold_graph(fltrd_ctg_alns, components_fn, max_term_dist)

    # Add edges for unfilled gaps
    for u, v in agp_psg.edges:
        if not aln_psg.has_edge(u, v):
            aln_psg.add_edge(u, v, agp_psg[u][v]["alignment"])

    # Remove known false edges: nodes joined in the contig AGP keep only that adjacency
    for u, v in agp_psg.edges:
        for neighbor in list(aln_psg.neighbors(u)):
            if neighbor != v:
                aln_psg.remove_edge(u, neighbor)
                aln_psg.remove_edge(neighbor, u)

        for neighbor in list(aln_psg.neighbors(v)):
            if neighbor != u:
                aln_psg.remove_edge(neighbor, v)
                aln_psg.remove_edge(v, neighbor)

    # Adjust the graph depending on if only fills or joins are requested
    if fill_only:
        # Keep only the adjacencies that are already present in the contig AGP
        psg = PatchScaffoldGraph(components_fn)
        for u, v in agp_psg.edges:
            psg.add_edge(u, v, aln_psg[u][v]["alignment"])
            psg.add_edge(v, u, aln_psg[v][u]["alignment"])
        aln_psg = psg

    if join_only:
        # Restore the original gap alignments so that existing gaps are not filled
        for u, v in agp_psg.edges:
            aln_psg[u][v]["alignment"] = agp_psg[u][v]["alignment"]
            aln_psg[v][u]["alignment"] = agp_psg[v][u]["alignment"]

    if debug_mode:
        aln_psg.write_gml(output_path + file_prefix + ".debug.sg.gml")

    # Compute a matching solution for the graph
    log("INFO", "Computing a matching solution to the scaffold graph")
    match_psg = aln_psg.max_weight_matching()

    if debug_mode:
        match_psg.write_gml(output_path + file_prefix + ".debug.matching.gml")

    # Write the output in AGP format
    log("INFO", "Writing output files")
    match_psg.write_agp(output_path + file_prefix + ".agp", output_path + file_prefix + ".ctg.fasta", add_suffix_to_unplaced=add_suffix)

    # Write the output in fasta format
    cmd = [
        "ragtag_agp2fa.py",
        output_path + file_prefix + ".agp",
        components_fn
    ]
    run_oae(cmd, output_path + file_prefix + ".fasta", ragtag_log)

    log("INFO", "Goodbye")
コード例 #14
0
ファイル: ragtag_splitasm.py プロジェクト: malonge/RagTag
def main():
    """
    Command line entry point for `ragtag.py splitasm`.

    Each run of "N" characters longer than '-n' in the assembly is treated as a gap. An AGP file describing
    the original sequences in terms of gap-free components ("seq00000000", "seq00000001", ...) and gaps is
    written to the path given by '-o', and the component sequences are printed to stdout in FASTA format.

    Because stdout carries the FASTA output, usage errors are reported on stderr with a nonzero exit status.
    """
    parser = argparse.ArgumentParser(description='Split sequencs at gaps',
                                     usage="ragtag.py splitasm <asm.fa>")
    parser.add_argument("asm",
                        metavar="<asm.fa>",
                        default="",
                        type=str,
                        help="assembly fasta file (uncompressed or bgzipped)")
    parser.add_argument("-n",
                        metavar="INT",
                        type=int,
                        default=0,
                        help="minimum gap size [0]")
    parser.add_argument("-o",
                        metavar="PATH",
                        type=str,
                        default="ragtag.splitasm.agp",
                        help="output AGP file path [./ragtag.splitasm.agp]")

    # Parse the command line arguments
    args = parser.parse_args()
    if not args.asm:
        # Keep the help text out of stdout (the FASTA stream) and signal failure to the caller
        parser.print_help(sys.stderr)
        sys.exit("\n** The assembly FASTA file is required **")

    asm_fn = args.asm
    min_gap_size = args.n
    agp_fn = args.o

    # Initialize the AGP file
    agp = AGPFile(agp_fn, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    fai = pysam.FastaFile(asm_fn)
    try:
        # Process the FASTA file
        new_header_idx = 0
        for header in sorted(fai.references):
            seq = fai.fetch(header).upper()
            seq_len = fai.get_reference_length(header)

            # 0-based, half-open coordinates of every gap longer than the minimum gap size
            gap_coords = [(m.start(), m.end()) for m in re.finditer(r'N+', seq)
                          if m.end() - m.start() > min_gap_size]

            if not gap_coords:
                # No gaps: the whole sequence is a single component
                new_header = "seq{0:08}".format(new_header_idx)
                new_header_idx += 1
                agp.add_seq_line(header, "1", str(seq_len), "1", "W",
                                 new_header, "1", str(seq_len), "+")
                continue

            # Sentinel "gap" just past the end of the sequence so that the loop below
            # also emits the component following the last real gap
            gap_coords.append((seq_len, seq_len + 1))
            pid = 1
            if gap_coords[0][0]:
                # The sequence doesn't start with a gap
                new_header = "seq{0:08}".format(new_header_idx)
                agp.add_seq_line(header, "1", str(gap_coords[0][0]),
                                 str(pid), "W", new_header, "1",
                                 str(gap_coords[0][0]), "+")
                new_header_idx += 1
                pid += 1

            for i in range(1, len(gap_coords)):
                # Add the gap line
                gap_start, gap_end = gap_coords[i - 1]
                gap_len = gap_end - gap_start
                agp.add_gap_line(header, str(gap_start + 1), str(gap_end),
                                 str(pid), "N", str(gap_len), "scaffold",
                                 "yes", "align_genus")
                pid += 1

                # Add the sequence line, unless the gap runs to the end of the sequence
                obj_start, obj_end = gap_end, gap_coords[i][0]
                comp_len = obj_end - obj_start
                if gap_end != seq_len:
                    new_header = "seq{0:08}".format(new_header_idx)
                    agp.add_seq_line(header, str(obj_start + 1), str(obj_end),
                                     str(pid), "W", new_header, "1",
                                     str(comp_len), "+")
                    new_header_idx += 1
                    pid += 1

        # Write the AGP file once, after all lines have been added
        agp.write()

        # Iterate over the AGP file and print the sequences
        agp = AGPFile(agp_fn, mode="r")
        for line in agp.iterate_lines():
            if not line.is_gap:
                obj, comp, obj_beg, obj_end = line.obj, line.comp, line.obj_beg, line.obj_end
                print(">" + comp)
                # AGP coordinates are used 1-based inclusive here; fetch() takes a 0-based start
                print(fai.fetch(obj, obj_beg - 1, obj_end))
    finally:
        fai.close()