def write_breaks(out_file, query_file, ctg_breaks, overwrite, remove_suffix):
    """
    Write the intermediate file for contig breaks in AGP v2.1 format.

    Every sequence of the query FASTA file is written as its own AGP object,
    in sorted header order. A sequence without an entry in ``ctg_breaks``
    becomes a single full-length component line. A sequence with break
    points becomes one component line per interval between consecutive
    break points, each component named ``<header>:<start>-<end>(+)``.

    :param out_file: path of the AGP file to write
    :param query_file: query FASTA file, opened with ``pysam.FastaFile``
    :param ctg_breaks: dict mapping a query header to its break positions
    :param overwrite: if False and ``out_file`` exists, keep it and return
    :param remove_suffix: if True, an unbroken sequence keeps its header as
        the component ID; otherwise ``:0-<length>(+)`` is appended to it
    """
    # Check if the output file already exists
    if os.path.isfile(out_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_file)
            return
        log("Overwriting pre-existing file: " + out_file)

    fai = pysam.FastaFile(query_file)
    try:
        all_q_seqs = sorted(fai.references)
        agp = AGPFile(out_file, mode="w")

        agp.add_pragma()
        agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

        for q in all_q_seqs:
            q_len = fai.get_reference_length(q)

            # Check if this sequence was broken during misassembly correction
            if q not in ctg_breaks:
                # Add suffix to query header, unless otherwise requested
                unchanged_comp_header = q
                if not remove_suffix:
                    unchanged_comp_header = q + ":0" + "-" + str(q_len) + "(+)"

                agp.add_seq_line(q, "1", str(q_len), "1", "W",
                                 unchanged_comp_header, "1", str(q_len), "+")
                continue

            # This query sequence was broken
            pid = 1
            start = 0
            # De-duplicate the break points: a repeated position would yield
            # a zero-length interval whose begin coordinate exceeds its end.
            for i in sorted(set(ctg_breaks[q])):
                agp.add_seq_line(q, str(start + 1), str(i), str(pid), "W",
                                 q + ":" + str(start) + "-" + str(i) + "(+)",
                                 "1", str(i - start), "+")
                start = i
                pid += 1

            # Add one line for the last interval
            agp.add_seq_line(q, str(start + 1), str(q_len), str(pid), "W",
                             q + ":" + str(start) + "-" + str(q_len) + "(+)",
                             "1", str(q_len - start), "+")

        log("Writing: " + out_file)
        agp.write()
    finally:
        # Release the FASTA handle even if writing the AGP lines fails.
        fai.close()
def main():
    """
    Entry point of the ``ragtag.py`` command dispatcher.

    Reads the command from ``sys.argv[1]``. The help, version and citation
    flags print the corresponding text. A known command runs the matching
    RagTag script as a child process, forwarding the remaining arguments.
    Any other command prints the usage text followed by an
    "unrecognized command" notice.

    If the child process finishes with a non-zero status, this function
    exits with that same status so the failure is visible to the caller.
    """
    VERSION = get_ragtag_version()
    CITATION = """ Alonge, Michael, et al. "RaGOO: fast and accurate reference-guided scaffolding of draft genomes." Genome biology 20.1 (2019): 1-17. """
    description = """ RagTag: Reference-guided scaffolding and misassembly correction. Version: %s usage: ragtag.py <command> [options] commands: correct correct contig misassemblies scaffold scaffold contigs updategff update gff intervals options: -c, --citation -v, --version""" % VERSION

    # Command name -> executable that implements it
    subcommands = {
        "scaffold": "ragtag_scaffold.py",
        "correct": "ragtag_correct.py",
        "updategff": "ragtag_update_gff.py",
    }

    # No command given: show the usage text
    if len(sys.argv) < 2:
        print(description)
        return

    cmd = sys.argv[1]
    if cmd in ("-h", "--help"):
        print(description)
    elif cmd in ("-v", "--version"):
        print(VERSION)
    elif cmd in ("-c", "--citation"):
        print(CITATION)
    elif cmd in subcommands:
        status = subprocess.call([subcommands[cmd]] + sys.argv[2:])
        # Propagate a failing sub-command instead of silently exiting 0
        if status:
            sys.exit(status)
    else:
        print(description)
        print("\n** unrecognized command: %s" % cmd)
def main():
    """
    Command line entry point for ``ragtag.py updategff``.

    Parses the gff file, the agp file and the ``-c`` flag. When either
    positional argument is missing, the help text is printed and the program
    exits. Otherwise the version and command line are logged and the gff
    intervals are updated with ``sub_update`` (when ``-c`` is set) or
    ``sup_update`` (otherwise).
    """
    parser = argparse.ArgumentParser(
        description="Update gff intervals given a RagTag AGP file",
        usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>",
    )

    # Both positionals are optional for argparse so that a bare invocation
    # reaches the help text below instead of an argparse error.
    optional_positional = dict(nargs='?', default="", type=str)
    parser.add_argument("gff", metavar="<genes.gff>", help="gff file",
                        **optional_positional)
    parser.add_argument("agp", metavar="<ragtag.*.agp>", help="agp file",
                        **optional_positional)
    parser.add_argument(
        "-c",
        action="store_true",
        default=False,
        help="update for misassembly correction (ragtag.correction.agp)",
    )

    args = parser.parse_args()
    if not (args.gff and args.agp):
        parser.print_help()
        sys.exit()

    log("VERSION", "RagTag " + get_ragtag_version())
    log("CMD", "ragtag.py updategff " + " ".join(sys.argv[1:]))

    gff_path = os.path.abspath(args.gff)
    agp_path = os.path.abspath(args.agp)

    # '-c' selects the updater used for misassembly correction AGP files
    updater = sub_update if args.c else sup_update
    updater(gff_path, agp_path)

    log("INFO", "Goodbye")
#!/usr/bin/env python from setuptools import setup import glob from ragtag_utilities.utilities import get_ragtag_version with open("README.md", "r") as fh: long_description = fh.read() scripts = glob.glob("*.p*") version = get_ragtag_version()[1:] setup(name='RagTag', version=version, author='Michael Alonge', author_email='*****@*****.**', description='Fast reference-guided genome assembly scaffolding', long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/malonge/RagTag", packages=['ragtag_utilities'], package_dir={'ragtag_utilities': 'ragtag_utilities/'}, license="MIT", classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", ], install_requires=[ 'intervaltree',
def _read_stripped_lines(path):
    """Return the right-stripped, non-empty lines of the text file at ``path``."""
    with open(path, "r") as f:
        stripped = (line.rstrip() for line in f)
        return [line for line in stripped if line]


def main():
    """
    Command line entry point for ``ragtag.py correct``.

    Steps, in order:

    1. parse and validate the command line options
    2. align the query to the reference (minimap2 or nucmer)
    3. filter and merge the alignments and collect putative break points
    4. optionally validate the break points with read coverage (-R / -F)
    5. optionally drop break points that fall inside gff features (--gff)
    6. write the break points as an AGP file and run
       ``ragtag_break_query.py`` to write the corrected FASTA file

    :raises ValueError: if an input file is missing or an option is invalid
    """
    parser = argparse.ArgumentParser(
        description='Reference-guided misassembly correction',
        usage="ragtag.py correct <reference.fa> <query.fa>")

    cor_options = parser.add_argument_group("correction options")
    cor_options.add_argument(
        "reference", metavar="<reference.fa>", nargs='?', default="", type=str,
        help="reference fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument(
        "query", metavar="<query.fa>", nargs='?', default="", type=str,
        help="query fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument("-f", metavar="INT", type=int, default=1000,
                             help="minimum unique alignment length [1000]")
    cor_options.add_argument("--remove-small", action="store_true", default=False,
                             help="remove unique alignments shorter than -f")
    cor_options.add_argument(
        "-q", metavar="INT", type=int, default=10,
        help="minimum mapq (NA for Nucmer alignments) [10]")
    cor_options.add_argument("-d", metavar="INT", type=int, default=100000,
                             help="alignment merge distance [100000]")
    cor_options.add_argument(
        "-b", metavar="INT", type=int, default=5000,
        help="minimum break distance from contig ends [5000]")
    cor_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="",
                             help="list of reference headers to ignore")
    cor_options.add_argument("-j", metavar="<skip.txt>", type=str, default="",
                             help="list of query headers to leave uncorrected")
    cor_options.add_argument(
        "--inter", action="store_true", default=False,
        help="only break misassemblies between reference sequences")
    cor_options.add_argument(
        "--intra", action="store_true", default=False,
        help="only break misassemblies within reference sequences")
    cor_options.add_argument("--gff", metavar="<features.gff>", type=str, default="",
                             help="don't break sequences within gff intervals")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False,
                            help="add suffix to unaltered sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    mm2_default = "-x asm5"
    aln_options.add_argument("-t", metavar="INT", type=int, default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument(
        "--aligner", metavar="PATH", type=str, default="minimap2",
        help="whole genome aligner executable ('nucmer' or 'minimap2') [minimap2]")
    aln_options.add_argument(
        "--mm2-params", metavar="STR", type=str, default=mm2_default,
        help="space delimited minimap2 whole genome alignment parameters ['%s']" % mm2_default)
    aln_options.add_argument(
        "--nucmer-params", metavar="STR", type=str, default="-l 100 -c 500",
        help="space delimted nucmer whole genome alignment parameters ['-l 100 -c 500']")

    val_options = parser.add_argument_group("validation options")
    val_options.add_argument(
        "--read-aligner", metavar="PATH", type=str, default="minimap2",
        help="read aligner executable (only 'minimap2' is allowed) [minimap2]")
    val_options.add_argument(
        "-R", metavar="<reads.fasta>", type=str, default="",
        help="validation reads. gzipped fastq or fasta allowed.")
    val_options.add_argument("-F", metavar="<reads.fofn>", type=str, default="",
                             help="same as '-R', but a list of files.")
    val_options.add_argument(
        "-T", metavar="sr", type=str, default="",
        help="read type. 'sr' and 'corr' accepted for short reads and error corrected long-reads, respectively.")
    val_options.add_argument("-v", metavar="INT", type=int, default=10000,
                             help="coverage validation window size [10000]")
    val_options.add_argument(
        "--max-cov", metavar="INT", type=int, default=-1,
        help="break sequences at regions at or above this coverage level [AUTO]")
    val_options.add_argument(
        "--min-cov", metavar="INT", type=int, default=-1,
        help="break sequences at regions at or below this coverage level [AUTO]")
    # Merge breakpoints within this distance after validation
    val_options.add_argument("-m", metavar="INT", type=int, default=1000,
                             help=argparse.SUPPRESS)

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        sys.exit()

    log("RagTag " + get_ragtag_version())
    log("CMD: " + " ".join(sys.argv))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    num_threads = args.t
    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    min_break_dist = args.m
    min_break_end_dist = args.b
    val_window_size = args.v

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    output_path = os.path.abspath(output_path) + "/"

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    gff_file = args.gff
    if gff_file:
        gff_file = os.path.abspath(gff_file)

    # Skip/exclude options
    query_blacklist = set()
    if args.j:
        query_blacklist.update(_read_stripped_lines(os.path.abspath(args.j)))

    ref_blacklist = set()
    if args.e:
        ref_blacklist.update(_read_stripped_lines(os.path.abspath(args.e)))

    # Get aligner arguments
    genome_aligner_path = args.aligner
    genome_aligner = genome_aligner_path.split("/")[-1]
    if genome_aligner not in {'minimap2', 'nucmer'}:
        raise ValueError(
            "Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'.")
    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params
    min_mapq = args.q
    if genome_aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Check if intra/inter breaking is desired
    only_intra = args.intra
    only_inter = args.inter
    if only_intra and only_inter:
        raise ValueError(
            "Must speficity either '--inter' or '--intra', not both.")
    break_inter = not only_intra
    break_intra = not only_inter

    # read-alignment parameters
    val_reads = args.R
    val_reads_fofn = args.F
    val_reads_tech = args.T
    read_aligner_path = args.read_aligner
    read_aligner = read_aligner_path.split("/")[-1]
    if read_aligner != "minimap2":
        raise ValueError(
            "Only minimap2 can be used for read alignments. got: %s" % read_aligner)

    # If the genome aligner is minimap2, we can just use that path for read alignment
    if genome_aligner == 'minimap2':
        read_aligner_path = genome_aligner_path

    # Read type -> (minimap2 preset, minimum distance from the contig ends
    # needed for coverage to accumulate)
    read_presets = {
        "sr": ("-ax sr", 1000),
        "corr": ("-ax asm5", 10000),
    }

    # Make sure that if -R or -F, -T has been specified. The value is checked
    # here, before any alignment is run, so that a bad '-T' fails fast
    # instead of after the whole genome alignment has finished.
    if val_reads or val_reads_fofn:
        if not val_reads_tech:
            raise ValueError("'-T' must be provided when using -R or -F.")
        if val_reads_tech not in read_presets:
            raise ValueError("'-T' must be either 'sr' or 'corr'.")

    # Make a list of read sequences. Blank lines in the list file are
    # skipped; otherwise they would resolve to the working directory.
    read_files = []
    if val_reads_fofn:
        read_files = [os.path.abspath(p) for p in _read_stripped_lines(val_reads_fofn)]
    elif val_reads:
        read_files.append(os.path.abspath(val_reads))

    # Coverage thresholds (-1 means the threshold is chosen automatically)
    max_cov = args.max_cov
    min_cov = args.min_cov

    if max_cov < -1:
        raise ValueError("--max-cov must be >=0")

    if min_cov < -1:
        raise ValueError("--min-cov must be >=0")

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.correction.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.correction.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.correction.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.correction.debug.query.info.txt"

    # Align the query to the reference.
    log("Mapping the query genome to the reference genome")
    if genome_aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file], genome_aligner_path,
                             mm2_params, output_path + "c_query_against_ref",
                             in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file], genome_aligner_path,
                           nucmer_params, output_path + "c_query_against_ref",
                           in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, convert from delta to paf.
    if genome_aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + "c_query_against_ref.delta"]
        run_o(cmd, output_path + "c_query_against_ref.paf")

    # Read and organize the alignments.
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "c_query_against_ref.paf",
                                      query_blacklist, ref_blacklist)

    # Filter and merge the alignments.
    if debug_mode:
        # create new empty copies of debugging output files
        for debug_file in (debug_non_fltrd_file, debug_fltrd_file,
                           debug_merged_file, debug_query_info_file):
            open(debug_file, "w").close()

    log("Filtering and merging alignments")
    for i in ctg_alns:

        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[i]))

        # Each filter result is checked for None before the next step runs
        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(
            min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:

                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(ctg_alns[i]))

                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist)

    # Get the putative breakpoints for each query sequence, if any.
    ctg_breaks = dict()
    for i in ctg_alns:
        if ctg_alns[i] is not None:

            # Write merged alignments and confidence scores
            if debug_mode:
                with open(debug_merged_file, "a") as f:
                    f.write(str(ctg_alns[i]))

                with open(debug_query_info_file, "a") as f:
                    f.write("\t".join([
                        i,
                        ctg_alns[i].best_ref_header,
                        str(ctg_alns[i].grouping_confidence),
                        str(ctg_alns[i].location_confidence),
                        str(ctg_alns[i].orientation_confidence),
                    ]) + "\n")

            breaks = []
            intra_breaks, inter_breaks = ctg_alns[i].get_break_candidates(
                min_dist=min_break_end_dist)
            if break_intra:
                breaks = breaks + intra_breaks
            if break_inter:
                breaks = breaks + inter_breaks
            if breaks:
                ctg_breaks[i] = breaks

    # If desired, validate the putative breakpoints by observing read coverage.
    if read_files:
        log("Validating putative query breakpoints via read alignment.")
        log("Aligning reads to query sequences.")
        read_preset, min_cov_end_dist = read_presets[val_reads_tech]

        if not os.path.isfile(output_path + "c_reads_against_query.s.bam"):
            al = Minimap2SAMAligner(query_file, read_files, read_aligner_path,
                                    read_preset + " -t " + str(num_threads),
                                    output_path + "c_reads_against_query",
                                    in_overwrite=overwrite_files)
            al.run_aligner()
        else:
            log("Retaining pre-existing read alignments: " + output_path +
                "c_reads_against_query.s.bam")

        # Compress, sort and index the alignments.
        log("Compressing, sorting, and indexing read alignments")
        run_samtools(output_path, num_threads, overwrite_files)

        # Validate the breakpoints
        log("Validating putative query breakpoints")

        # Give at least 10k/1k from ctg ends for coverage to accumulate for corr and sr, respectively.
        val_min_break_end_dist = max(min_cov_end_dist, min_break_end_dist)

        # Validate the breakpoints
        ctg_breaks = validate_breaks(ctg_breaks, output_path, num_threads,
                                     overwrite_files, val_min_break_end_dist,
                                     max_cov, min_cov,
                                     window_size=val_window_size,
                                     clean_dist=min_break_dist,
                                     debug=debug_mode)

    # Check if we need to avoid gff intervals
    if gff_file:
        log("Avoiding breaks within GFF intervals")
        it = make_gff_interval_tree(gff_file)
        non_gff_breaks = dict()
        for ctg in ctg_breaks:
            new_breaks = []
            for i in ctg_breaks[ctg]:
                if it[ctg][i]:
                    log("Avoiding breaking %s at %d. This point intersects a feature in the gff file." % (ctg, i))
                else:
                    new_breaks.append(i)
            if new_breaks:
                non_gff_breaks[ctg] = new_breaks
        ctg_breaks = non_gff_breaks

    # Write the summary of query sequence breaks in AGP format
    agp_file = output_path + "ragtag.correction.agp"
    write_breaks(agp_file, query_file, ctg_breaks, overwrite_files, remove_suffix)

    # Write the scaffolds.
    log("Writing broken contigs")
    qf_name = query_file.split("/")[-1]
    # Strip only the last extension. splitext leaves a name without any "."
    # intact, whereas slicing at rfind(".") == -1 cut off its last character.
    qf_pref = os.path.splitext(qf_name)[0]
    cmd = ["ragtag_break_query.py", agp_file, query_file]
    run_o(cmd, output_path + qf_pref + ".corrected.fasta")

    log("Goodbye")
def write_agp(self, agp_fn, ref_fn, add_suffix_to_unplaced=False):
    """
    Write the AGP file implied by the scaffold graph.

    Objects are named ``scf00000000``, ``scf00000001``, ... Each object
    starts at a sequence whose two end nodes have a combined degree of 2 and
    follows the graph edges from there. The "alignment" attribute of each
    edge becomes either a gap line or a patch sequence line. Reference
    sequences that were never placed are written last, one object each.

    :param agp_fn: AGP file name
    :param ref_fn: reference FASTA file name
    :param add_suffix_to_unplaced: add "_RagTag" to unscaffolded sequences
    """
    placed = set()
    visited_edges = set()
    scaffold_idx = -1

    agp = AGPFile(agp_fn, "w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    while True:
        # Search the unvisited edges for the start of a new object
        from_node = to_node = cur_ref = None
        for u, v in sorted(self.edges):
            if (u, v) in visited_edges:
                continue

            # Node names end in "_b" or "_e"; drop that 2-character suffix
            u_base = u[:-2]
            u_degree = 0
            for end_node in (u_base + "_b", u_base + "_e"):
                if end_node in self.nodes:
                    u_degree += self.graph.degree[end_node]

            assert u_degree in {2, 4}

            # A combined degree of 2 marks a starting target sequence
            if u_degree != 2:
                continue

            cur_ref, from_node, to_node = u_base, u, v
            visited_edges.add((u, v))
            visited_edges.add((v, u))
            break

        # No new starting target sequence means every object is written
        if from_node is None:
            break

        # Set up the new object
        scaffold_idx += 1
        obj_header = "scf" + "{0:08}".format(scaffold_idx)
        obj_pos = 0
        obj_pid = 1

        # The first target sequence of the object
        ref_len = self.component_lens[cur_ref]
        ref_strand = "-" if from_node.endswith("_b") else "+"
        agp.add_seq_line(obj_header, obj_pos + 1, obj_pos + ref_len, obj_pid,
                         "W", cur_ref, 1, ref_len, ref_strand)
        obj_pos += ref_len
        obj_pid += 1
        placed.add(cur_ref)

        # Walk the remaining edges of this object
        while True:
            # The patch joining the previous sequence to the next one
            aln = self.graph[from_node][to_node]["alignment"]
            patch_query = aln.query
            patch_strand = "-" if aln.strand else "+"

            patch_len = aln.their_query_start - aln.my_query_end
            if patch_len > 0:
                if aln.is_gap:
                    agp.add_gap_line(obj_header, obj_pos + 1, obj_pos + patch_len,
                                     obj_pid, "N", patch_len, "scaffold", "yes",
                                     "align_genus")
                else:
                    agp.add_seq_line(obj_header, obj_pos + 1, obj_pos + patch_len,
                                     obj_pid, "W", patch_query,
                                     aln.my_query_end + 1, aln.their_query_start,
                                     patch_strand)
                    placed.add(patch_query)
                obj_pos += patch_len
                obj_pid += 1

            # The next reference sequence. A negative patch length trims that
            # many bases from the start of the component.
            overlap = min(0, patch_len)
            cur_ref = to_node[:-2]
            ref_len = self.component_lens[cur_ref]
            ref_strand = "-" if to_node.endswith("_e") else "+"
            agp.add_seq_line(obj_header, obj_pos + 1, obj_pos + (ref_len + overlap),
                             obj_pid, "W", cur_ref, 1 - overlap, ref_len,
                             ref_strand)
            obj_pos += ref_len + overlap
            obj_pid += 1
            placed.add(cur_ref)

            # Continue from the opposite end of the sequence just written
            from_node = to_node[:-2] + ("_e" if to_node.endswith("_b") else "_b")
            if from_node not in self.graph.nodes:
                break

            next_nodes = set(self.graph[from_node])
            assert len(next_nodes) == 1
            to_node = next_nodes.pop()
            visited_edges.add((from_node, to_node))
            visited_edges.add((to_node, from_node))

    # Write unplaced reference sequences
    fai = pysam.FastaFile(ref_fn)
    all_ref_seqs = set(fai.references)
    fai.close()

    for c in sorted(all_ref_seqs - placed):
        c_len = str(self.component_lens[c])
        agp.add_seq_line(c + "_RagTag" * add_suffix_to_unplaced, "1", c_len,
                         "1", "W", c, "1", c_len, "+")

    agp.write()
def main():
    """
    Entry point of the ``ragtag.py`` command dispatcher.

    Reads the command from ``sys.argv[1]``. The help, version and citation
    flags print the corresponding text. A known command runs the matching
    RagTag script as a child process, forwarding the remaining arguments.
    Any other command prints the usage text followed by an
    "unrecognized command" notice.

    If the child process finishes with a non-zero status, this function
    exits with that same status so the failure is visible to the caller.
    """
    VERSION = get_ragtag_version()
    CITATION = """ Alonge, Michael, et al. "Automated assembly scaffolding elevates a new tomato system for high-throughput genome editing." bioRxiv (2021). https://doi.org/10.1101/2021.11.18.469135 ** RagTag supersedes RaGOO ** Alonge, Michael, et al. "RaGOO: fast and accurate reference-guided scaffolding of draft genomes." Genome biology 20.1 (2019): 1-17. https://doi.org/10.1186/s13059-019-1829-6 """
    description = """ RagTag: Tools for fast and flexible genome assembly scaffolding and improvement. Version: %s usage: ragtag.py <command> [options] assembly improvement: correct homology-based misassembly correction scaffold homology-based assembly scaffolding patch homology-based assembly patching merge scaffold merging file utilities: agp2fa build a FASTA file from an AGP file agpcheck check for valid AGP file format asmstats assembly statistics splitasm split an assembly at gaps delta2paf delta to PAF file conversion paf2delta PAF to delta file conversion updategff update gff intervals options: -c, --citation -v, --version""" % VERSION

    # Command name -> executable that implements it
    subcommands = {
        "correct": "ragtag_correct.py",
        "scaffold": "ragtag_scaffold.py",
        "merge": "ragtag_merge.py",
        "patch": "ragtag_patch.py",
        "agp2fa": "ragtag_agp2fa.py",
        "agpcheck": "ragtag_agpcheck.py",
        "updategff": "ragtag_update_gff.py",
        "asmstats": "ragtag_asmstats.py",
        "splitasm": "ragtag_splitasm.py",
        "delta2paf": "ragtag_delta2paf.py",
        "paf2delta": "ragtag_paf2delta.py",
    }

    # No command given: show the usage text
    if len(sys.argv) < 2:
        print(description)
        return

    cmd = sys.argv[1]
    if cmd in ("-h", "--help"):
        print(description)
    elif cmd in ("-v", "--version"):
        print(VERSION)
    elif cmd in ("-c", "--citation"):
        print(CITATION)
    elif cmd in subcommands:
        status = subprocess.call([subcommands[cmd]] + sys.argv[2:])
        # Propagate a failing sub-command instead of silently exiting 0
        if status:
            sys.exit(status)
    else:
        print(description)
        print("\n** unrecognized command: %s **" % cmd)
def write_orderings(out_agp_file, out_confidence_file, query_file,
                    ordering_dict, ctg_dict, gap_dict, gap_type_dict,
                    make_chr0, overwrite, add_suffix):
    """
    Write the scaffolding result as an AGP file plus a confidence score table.

    For every reference header (in sorted order) an object named
    ``<reference header>_RagTag`` is written. Its components are the ordered
    query sequences, separated by the gaps listed for that reference. Query
    sequences that were not placed are then written either concatenated into
    a single ``Chr0_RagTag`` object with 100 bp gaps between them, or as one
    object per sequence.

    :param out_agp_file: path of the AGP file to write
    :param out_confidence_file: path of the tab-delimited confidence file
    :param query_file: query FASTA file, opened with ``pysam.FastaFile``
    :param ordering_dict: dict mapping a reference header to its ordered
        query entries; element 2 of each entry is the query header
    :param ctg_dict: dict mapping a query header to an object providing
        ``query_len``, ``orientation`` and the three confidence scores
    :param gap_dict: dict mapping a reference header to its gap sizes
    :param gap_type_dict: dict mapping a reference header to the AGP
        component type of each gap
    :param make_chr0: if True, concatenate unplaced sequences into Chr0_RagTag
    :param overwrite: if False and ``out_agp_file`` exists, keep it and
        return without writing either output file
    :param add_suffix: if True, append "_RagTag" to the object names of
        individually listed unplaced sequences
    """
    # Check if the output file already exists
    if os.path.isfile(out_agp_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_agp_file)
            return
        log("Overwriting pre-existing file: " + out_agp_file)

    # Proceed with writing the intermediate output
    placed_seqs = set()
    all_out_cs_lines = []  # For confidence scores
    agp = AGPFile(out_agp_file, mode="w")

    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Go through the reference sequences in sorted order
    for ref_header in sorted(ordering_dict):
        pid = 1
        pos = 0
        new_ref_header = ref_header + "_RagTag"
        q_seqs = ordering_dict[ref_header]
        gap_seqs = gap_dict[ref_header]
        gap_types = gap_type_dict[ref_header]

        # Iterate through the query sequences for this reference header
        for i, q_entry in enumerate(q_seqs):
            q = q_entry[2]
            placed_seqs.add(q)
            ctg = ctg_dict[q]
            qlen = ctg.query_len

            obj_beg = pos + 1
            pos += qlen
            agp.add_seq_line(new_ref_header, str(obj_beg), str(pos), str(pid),
                             "W", q, "1", str(qlen), ctg.orientation)
            pid += 1

            # Save the confidence score info
            all_out_cs_lines.append("\t".join([
                q,
                str(ctg.grouping_confidence),
                str(ctg.location_confidence),
                str(ctg.orientation_confidence),
            ]))

            # A gap follows this sequence if one is listed at the same index
            if i < len(gap_seqs):
                obj_beg = pos + 1
                pos += gap_seqs[i]
                agp.add_gap_line(new_ref_header, str(obj_beg), str(pos),
                                 str(pid), gap_types[i], str(gap_seqs[i]),
                                 "scaffold", "yes", "align_genus")
                pid += 1

    # Write unplaced sequences
    fai = pysam.FastaFile(query_file)
    try:
        unplaced_seqs = sorted(set(fai.references) - placed_seqs)
        if unplaced_seqs:
            if make_chr0:
                pos = 0
                pid = 1
                new_ref_header = "Chr0_RagTag"
                for q in unplaced_seqs:
                    qlen = fai.get_reference_length(q)
                    obj_beg = pos + 1
                    pos += qlen
                    agp.add_seq_line(new_ref_header, str(obj_beg), str(pos),
                                     str(pid), "W", q, "1", str(qlen), "+")
                    pid += 1

                    # Now for the gap, since we are making a chr0
                    obj_beg = pos + 1
                    pos += 100
                    agp.add_gap_line(new_ref_header, str(obj_beg), str(pos),
                                     str(pid), "U", "100", "contig", "no", "na")
                    pid += 1

                # Remove the final unnecessary gap
                agp.pop_agp_line()
            else:
                # List the unplaced contigs individually
                for q in unplaced_seqs:
                    qlen = fai.get_reference_length(q)
                    obj_name = q + "_RagTag" if add_suffix else q
                    agp.add_seq_line(obj_name, "1", str(qlen), "1", "W", q,
                                     "1", str(qlen), "+")

        agp.write()
    finally:
        # Release the FASTA handle even if writing the AGP lines fails.
        fai.close()

    # Write the confidence scores. One line per placed sequence; when nothing
    # was placed only the header is written, without a trailing blank line.
    with open(out_confidence_file, "w") as f:
        f.write(
            "query\tgrouping_confidence\tlocation_confidence\torientation_confidence\n"
        )
        f.writelines(line + "\n" for line in all_out_cs_lines)
def main():
    """
    Command-line entry point for 'ragtag.py scaffold'.

    Parses the command line, aligns the query assembly to the reference,
    filters and merges the alignments, orders the query sequences along the
    reference sequences and writes the result as AGP, FASTA and stats files
    into the output directory.

    Exits with a non-zero status if the reference or query FASTA argument is
    missing. Raises ValueError for missing input files, non-positive gap size
    limits or an unsupported aligner.
    """
    parser = argparse.ArgumentParser(
        description='Reference-guided scaffolding',
        usage="ragtag.py scaffold <reference.fa> <query.fa>")

    parser.add_argument("reference", metavar="<reference.fa>", nargs='?', default="", type=str,
                        help="reference fasta file (uncompressed or bgzipped)")
    parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str,
                        help="query fasta file (uncompressed or bgzipped)")

    scaf_options = parser.add_argument_group("scaffolding options")
    scaf_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="",
                              help="list of reference headers to ignore [null]")
    scaf_options.add_argument("-j", metavar="<skip.txt>", type=str, default="",
                              help="list of query headers to leave unplaced [null]")
    scaf_options.add_argument("-f", metavar="INT", type=int, default=1000,
                              help="minimum unique alignment length [1000]")
    scaf_options.add_argument("--remove-small", action="store_true", default=False,
                              help="remove unique alignments shorter than -f")
    scaf_options.add_argument("-q", metavar="INT", type=int, default=10,
                              help="minimum mapq (NA for Nucmer alignments) [10]")
    scaf_options.add_argument("-d", metavar="INT", type=int, default=100000,
                              help="alignment merge distance [100000]")
    scaf_options.add_argument("-i", metavar="FLOAT", type=float, default=0.2,
                              help="minimum grouping confidence score [0.2]")
    scaf_options.add_argument("-a", metavar="FLOAT", type=float, default=0.0,
                              help="minimum location confidence score [0.0]")
    scaf_options.add_argument("-s", metavar="FLOAT", type=float, default=0.0,
                              help="minimum orientation confidence score [0.0]")
    scaf_options.add_argument("-C", action='store_true', default=False,
                              help="concatenate unplaced contigs and make 'chr0'")
    scaf_options.add_argument("-r", action='store_true', default=False,
                              help="infer gap sizes. if not, all gaps are 100 bp")
    scaf_options.add_argument("-g", metavar="INT", type=int, default=100,
                              help="minimum inferred gap size [100]")
    scaf_options.add_argument("-m", metavar="INT", type=int, default=100000,
                              help="maximum inferred gap size [100000]")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False,
                            help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    aln_options.add_argument("-t", metavar="INT", type=int, default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="minimap2",
                             help="aligner executable ('nucmer' or 'minimap2') [minimap2]")
    mm2_default = "-x asm5"
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default,
                             help="space delimited minimap2 parameters ['%s']" % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="-l 100 -c 500",
                             help="space delimted nucmer parameters ['-l 100 -c 500']")

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        # Passing the message to sys.exit() gives a non-zero exit status, so
        # calling scripts can detect the usage error.
        sys.exit("\n** The reference and query FASTA files are required **")

    log("RagTag " + get_ragtag_version())
    log("CMD: ragtag.py scaffold " + " ".join(sys.argv[1:]))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    group_score_thresh = args.i
    loc_score_thresh = args.a
    orient_score_thresh = args.s
    make_chr0 = args.C
    infer_gaps = args.r
    num_threads = args.t

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        # makedirs (rather than mkdir) so that nested output paths can be created
        os.makedirs(output_path)
    output_path = os.path.abspath(output_path) + "/"

    # Setup a log file for external RagTag scripts
    ragtag_log = output_path + "ragtag.scaffold.err"
    open(ragtag_log, "w").close()  # Wipe the log file

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    # Gap options
    min_gap_size = args.g
    max_gap_size = args.m
    if min_gap_size < 1:
        raise ValueError("the minimum gap size must be positive")

    if max_gap_size < 1:
        raise ValueError("the maximum gap size must be positive")

    # Skip/exclude options: one header per line
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(args.j)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments. Only the executable name (last path element)
    # decides which aligner is being used.
    aligner_path = args.aligner
    aligner = aligner_path.split("/")[-1]
    if aligner not in {'minimap2', 'nucmer'}:
        raise ValueError("Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'.")
    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params (mapq filtering is disabled for nucmer alignments)
    min_mapq = args.q
    if aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.scaffolds.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.scaffolds.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.scaffolds.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.scaffolds.debug.query.info.txt"

    # Align the query to the reference
    log("Mapping the query genome to the reference genome")
    if aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file], aligner_path, mm2_params,
                             output_path + "query_against_ref", in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file], aligner_path, nucmer_params,
                           output_path + "query_against_ref", in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, need to convert from delta to paf
    if aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + "query_against_ref.delta"]
        run_oae(cmd, output_path + "query_against_ref.paf", ragtag_log)

    # Read and organize the alignments
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "query_against_ref.paf", query_blacklist, ref_blacklist)

    # Filter the alignments
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for i in ctg_alns:
        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[i]))

        # Each filtering step may return None when no alignments remain
        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:
                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(ctg_alns[i]))

                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist)

    # Remove query sequences which have no more qualifying alignments
    fltrd_ctg_alns = dict()
    for i in ctg_alns:
        if ctg_alns[i] is not None:
            # Write merged alignments and confidence scores
            if debug_mode:
                with open(debug_merged_file, "a") as f:
                    f.write(str(ctg_alns[i]))

                with open(debug_query_info_file, "a") as f:
                    f.write("\t".join([
                        i,
                        ctg_alns[i].best_ref_header,
                        str(ctg_alns[i].grouping_confidence),
                        str(ctg_alns[i].location_confidence),
                        str(ctg_alns[i].orientation_confidence),
                    ]) + "\n")

            # All three confidence scores must strictly exceed their thresholds
            if all([
                ctg_alns[i].grouping_confidence > group_score_thresh,
                ctg_alns[i].location_confidence > loc_score_thresh,
                ctg_alns[i].orientation_confidence > orient_score_thresh
            ]):
                fltrd_ctg_alns[i] = ctg_alns[i]

    # For each reference sequence which has at least one assigned query sequence, get the list of
    # all query sequences assigned to that reference sequence.
    log("Ordering and orienting query sequences")
    mapped_ref_seqs = defaultdict(list)
    for i in fltrd_ctg_alns:
        best_ref = fltrd_ctg_alns[i].best_ref_header
        ref_start, ref_end = fltrd_ctg_alns[i].get_best_ref_pos()
        mapped_ref_seqs[best_ref].append((ref_start, ref_end, i))

    # Sort the query sequences for each reference sequence and define the padding sizes between adjacent query seqs
    g_inferred = 0
    g_small = 0
    g_large = 0
    pad_sizes = dict()
    gap_types = dict()
    for i in mapped_ref_seqs:
        # Remove contained contigs and sort the rest
        non_contained = remove_contained(mapped_ref_seqs[i])
        mapped_ref_seqs[i] = sorted(non_contained)
        if infer_gaps:
            # Infer the gap sizes between adjacent query seqs
            # Use the primary alignments to infer gap sizes
            pad_sizes[i] = []
            gap_types[i] = []
            for j in range(1, len(mapped_ref_seqs[i])):
                # Get info for the upstream alignment
                left_ctg = mapped_ref_seqs[i][j - 1][2]
                _, left_ref_end = fltrd_ctg_alns[left_ctg].get_best_ref_pos()
                _, left_qdist_end = fltrd_ctg_alns[left_ctg].get_best_q_dist()

                # Get info for the downstream alignment
                right_ctg = mapped_ref_seqs[i][j][2]
                right_ref_start, _ = fltrd_ctg_alns[right_ctg].get_best_ref_pos()
                right_qdist_start, _ = fltrd_ctg_alns[right_ctg].get_best_q_dist()

                # Get the inferred gap size
                i_gap_size = (right_ref_start - right_qdist_start) - (left_ref_end + left_qdist_end)

                # Check if the gap size is too small or too large
                if i_gap_size <= min_gap_size:
                    pad_sizes[i].append(100)
                    gap_types[i].append("U")
                    g_small += 1
                elif i_gap_size > max_gap_size:
                    pad_sizes[i].append(100)
                    gap_types[i].append("U")
                    g_large += 1
                else:
                    pad_sizes[i].append(i_gap_size)
                    gap_types[i].append("N")
                    g_inferred += 1
        else:
            # Fixed 100 bp gaps of unknown size between adjacent query sequences.
            # Built by repetition so no loop variable shadows the reference key 'i'.
            num_gaps = len(mapped_ref_seqs[i]) - 1
            pad_sizes[i] = [100] * num_gaps
            gap_types[i] = ["U"] * num_gaps

    if infer_gaps:
        log("%d inferred gap" % g_inferred)
        log("%d adjacent contig within min distance (%d) of each other" % (g_small, min_gap_size))
        log("%d inferred gaps exceed length threshold (%d)" % (g_large, max_gap_size))

    # Write the scaffolds
    log("Writing scaffolds")

    # Write the intermediate output file in AGP v2.1 format
    log("Writing: " + output_path + "ragtag.scaffolds.agp")
    write_orderings(output_path + "ragtag.scaffolds.agp", output_path + "ragtag.confidence.txt",
                    query_file, mapped_ref_seqs, fltrd_ctg_alns, pad_sizes, gap_types,
                    make_chr0, True, not remove_suffix)

    # Build a FASTA from the AGP
    cmd = [
        "ragtag_agp2fasta.py",
        output_path + "ragtag.scaffolds.agp",
        query_file
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.fasta", ragtag_log)

    # Calculate the stats
    cmd = [
        "ragtag_stats.py",
        output_path + "ragtag.scaffolds.agp",
        output_path + "ragtag.confidence.txt"
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.stats", ragtag_log)

    log("Goodbye")
def main():
    """
    Command-line entry point for 'ragtag.py merge'.

    Parses the command line, builds a scaffold graph from two or more AGP
    files describing scaffoldings of the same assembly, optionally re-weights
    the graph with Hi-C links, computes a matching and writes the merged
    scaffolding as AGP and FASTA files into the output directory.

    Exits with a non-zero status if the assembly FASTA or the AGP files are
    missing from the command line. Raises ValueError for a missing assembly
    file, an unknown gap merging function or fewer than two AGP files.
    """
    description = "Scaffold merging: derive a consensus scaffolding solution by reconciling distinct scaffoldings of " \
                  "'asm.fa'"
    parser = argparse.ArgumentParser(description=description, usage="ragtag.py merge <asm.fa> <scf1.agp> <scf2.agp> [...]")

    parser.add_argument("components", metavar="<asm.fasta>", nargs='?', default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
    parser.add_argument("agps", metavar="<scf1.agp> <scf2.agp> [...]", nargs='*', default=[], type=str, help="scaffolding AGP files")

    merge_options = parser.add_argument_group("merging options")
    merge_options.add_argument("-f", metavar="FILE", default="", type=str, help="CSV list of (AGP file,weight) [null]")
    merge_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query headers to leave unplaced [null]")
    merge_options.add_argument("-l", metavar="INT", default=100000, type=int, help="minimum assembly sequence length [100000]")
    merge_options.add_argument("-e", metavar="FLOAT", default=0.0, type=float, help="minimum edge weight. NA if using Hi-C [0.0]")
    merge_options.add_argument("--gap-func", metavar="STR", default="min", type=str, help="function for merging gap lengths {'min', 'max', or 'mean'} [min]")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS)

    hic_options = parser.add_argument_group("Hi-C options")
    hic_options.add_argument("-b", metavar="FILE", default="", type=str, help="Hi-C alignments in BAM format, sorted by read name [null]")
    hic_options.add_argument("-r", metavar="STR", default="GATC", type=str, help="CSV list of restriction enzymes/sites or 'DNase' [GATC]")
    hic_options.add_argument("-p", metavar="FLOAT", default=1.0, type=float, help="portion of the sequence termini to consider for links [1.0]")
    hic_options.add_argument("--list-enzymes", action='store_true', default=False, help="list all available restriction enzymes/sites")

    args = parser.parse_args()

    # Print a restriction enzyme help message if requested
    if args.list_enzymes:
        RestrictionEnzymes.get_info()
        sys.exit(0)

    if not args.components:
        parser.print_help()
        sys.exit("\n** The assembly FASTA file is required **")

    if not args.agps and not args.f:
        parser.print_help()
        sys.exit("\n** At least two AGP files are required **")

    log("VERSION", "RagTag " + get_ragtag_version())
    log("WARNING", "This is a beta version of `ragtag merge`")
    log("CMD", "ragtag.py merge " + " ".join(sys.argv[1:]))

    # Check that the components FASTA file exists
    comp_fname = args.components
    if not os.path.isfile(comp_fname):
        raise ValueError("Could not find file: %s" % comp_fname)

    # Optional arguments
    agp_fofn = args.f
    hic_bam_fname = args.b
    re_string = args.r
    portion = args.p

    # Set the minimum component sequence length (negative values are clamped to 0)
    min_comp_len = args.l
    if min_comp_len < 0:
        min_comp_len = 0

    # Set the minimum edge weight (negative values are clamped to 0)
    min_edge_weight = args.e
    if min_edge_weight < 0:
        min_edge_weight = 0

    # Set the gap merging function options
    gap_func = args.gap_func.upper()
    if gap_func not in {"MIN", "MAX", "MEAN"}:
        raise ValueError("Gap merging function must be either 'min', 'max', or 'mean'. Got: {}".format(args.gap_func))

    # Debugging options
    debug_mode = args.debug

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        # makedirs (rather than mkdir) so that nested output paths can be created
        os.makedirs(output_path)
    output_path = os.path.abspath(output_path) + "/"
    file_prefix = "ragtag.merge"

    overwrite_files = args.w
    add_suffix = args.u
    if not add_suffix:
        log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    # get the set of contigs to skip
    comp_exclusion_set = set()
    skip_fname = args.j
    if skip_fname:
        skip_fname = os.path.abspath(skip_fname)
        with open(skip_fname, "r") as f:
            for line in f:
                # Only the first whitespace-delimited field is the header.
                # Blank lines have no fields and are skipped.
                fields = line.split()
                if fields:
                    comp_exclusion_set.add(fields[0])

    # Setup a file for general logging
    merge_log = output_path + file_prefix + ".err"
    open(merge_log, "w").close()  # Wipe the log file

    # Process the AGP files. AGPs given on the command line all get weight 1.
    agp_list = [os.path.abspath(i) for i in args.agps]
    weight_list = [1] * len(agp_list)

    # Check for file of AGPs and weights. If given, it replaces the command line AGPs.
    if agp_fofn:
        agp_list, weight_list = [], []
        with open(agp_fofn, "r") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at the end of the file)
                if not line.strip():
                    continue
                fields = line.rstrip().split(",")
                agp_list.append(fields[0])
                weight_list.append(float(fields[1]))

    if len(agp_list) < 2:
        raise ValueError("At least two AGP files are required for merging")

    # Build the graph and filter nodes by sequence length
    log("INFO", "Building the scaffold graph from the AGP files")
    agp_multi_sg = AGPMultiScaffoldGraph(comp_fname)
    agp_multi_sg.add_agps(agp_list, in_weights=weight_list, exclusion_set=comp_exclusion_set)
    if min_comp_len:
        agp_multi_sg.filter_by_seq_len(min_comp_len)
    if debug_mode:
        nx.readwrite.gml.write_gml(agp_multi_sg.graph, output_path + "ragtag.merge.msg.gml")

    # Merge the SAG
    log("INFO", "Merging the scaffold graph")
    agp_sg = agp_multi_sg.merge()

    # Check if we are using Hi-C links to weight the graph.
    if hic_bam_fname:
        log("INFO", "Weighting the scaffold graph with Hi-C links")
        if not comp_fname or not re_string:
            raise RuntimeError("Hi-C requires alignments (-b) assembly sequences (-a) and restriction sites (-r)")

        cmd = [
            "ragtag_create_links.py",
            "-a", comp_fname,
            "-b", hic_bam_fname,
            "-r", re_string,
            "-p", str(portion)
        ]

        # Reuse an existing links file unless overwriting was requested
        out_links_fname = output_path + file_prefix + ".links"
        if os.path.isfile(out_links_fname) and not overwrite_files:
            log("INFO", "Retaining pre-existing file: " + out_links_fname)
        else:
            run_oae(cmd, out_links_fname, merge_log)

        hic_sg = build_hic_graph(out_links_fname, comp_fname)
        agp_sg = agp_sg.steal_weights_from(hic_sg)

    # Filter by edge weight (not applied when Hi-C links supply the weights)
    if min_edge_weight and not hic_bam_fname:
        agp_sg.filter_by_weight(min_edge_weight)

    if debug_mode:
        agp_sg.connect_and_write_gml(output_path + file_prefix + ".sg.gml")

    # Compute a solution to the ScaffoldGraph
    log("INFO", "Computing a scaffolding solution")
    cover_graph = get_maximal_matching(agp_sg)
    if debug_mode:
        # Write a copy of the cover graph that keeps only the edges
        tmp_cover_graph = nx.Graph()
        for u, v in cover_graph.edges:
            tmp_cover_graph.add_edge(u, v)
        nx.readwrite.gml.write_gml(tmp_cover_graph, output_path + file_prefix + ".covergraph.gml")

    # Write the scaffolding output to an AGP file
    log("INFO", "Writing results")
    write_agp_solution(cover_graph, agp_sg, output_path + file_prefix + ".agp", gap_func=gap_func, add_suffix_to_unplaced=add_suffix)

    # Generate a FASTA file corresponding to the AGP
    cmd = [
        "ragtag_agp2fa.py",
        output_path + file_prefix + ".agp",
        comp_fname
    ]
    run_oae(cmd, output_path + file_prefix + ".fasta", merge_log)

    log("INFO", "Goodbye")
def write_agp_solution(cover_graph, scaffold_graph, agp_fname, gap_func="MIN", add_suffix_to_unplaced=False):
    """
    Write a scaffolding solution to an AGP file.

    Two graphs are used. The cover graph defines the solution: each of its
    connected components is walked from one degree-1 end node to the other and
    becomes one AGP object. The scaffold graph supplies component lengths and
    the gap/linkage data stored on the edges.

    :param cover_graph: graph whose connected components are the scaffolds; node
                        names are a component name plus a two character end suffix
                        ("_b" or "_e")
    :param scaffold_graph: ScaffoldGraphBase instance holding the original edge data
    :param agp_fname: path of the AGP file to write
    :param gap_func: name of the function used to merge several known gap sizes
    :param add_suffix_to_unplaced: if True, append "_RagTag" to the object name of
                                   each unplaced component
    :raises TypeError: if scaffold_graph is not a ScaffoldGraphBase instance
    """
    if not isinstance(scaffold_graph, ScaffoldGraphBase):
        raise TypeError("scaffold_graph must be an instance of ScaffoldGraph")

    placed_components = set()

    agp = AGPFile(agp_fname, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Iterate through the connected components
    for i, cc in enumerate(nx.connected_components(G=cover_graph)):
        # Sort the list of nodes for deterministic output
        cc = sorted(list(cc))
        obj_header = "scf" + "{0:08}".format(i+1) + "_RagTag"

        # Find a node with degree=1, i.e. one end of the path
        current_node = None
        for node in cc:
            if cover_graph.degree[node] == 1:
                current_node = node
                break

        assert current_node is not None

        # Starting with the degree=1 node, build the AGP object from nodes in the path.
        visited_nodes = {current_node}
        degree = 0
        obj_id = 1
        obj_pos = 0

        # Traverse the component until we find the other end node
        while degree != 1:
            conn_nodes = set(cover_graph.neighbors(current_node))
            next_node = (conn_nodes - visited_nodes).pop()
            degree = cover_graph.degree[next_node]
            comp_len = scaffold_graph.get_component_len(next_node[:-2])

            # Check if this is an intra or inter sequence edge
            orientation = "+"
            if next_node[:-2] == current_node[:-2]:
                # Both nodes are ends of the same component: write the sequence line.
                # Arriving at the "_b" end means the component is traversed in reverse.
                if next_node.endswith("_b"):
                    orientation = "-"
                    assert current_node.endswith("_e")
                agp.add_seq_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + comp_len),
                    str(obj_id),
                    "W",
                    next_node[:-2],
                    "1",
                    str(comp_len),
                    orientation
                )
                obj_pos += comp_len
                placed_components.add(next_node[:-2])
            else:
                # The nodes belong to different components: write the gap line
                adjacency_data = scaffold_graph[current_node][next_node]

                # AGP Column 5
                all_is_known_gap_size = adjacency_data["is_known_gap_size"]
                comp_type = "N" if any(all_is_known_gap_size) else "U"

                # AGP column 6b: 100 unless at least one gap size is known
                gap_size = 100
                all_gap_sizes = adjacency_data["gap_size"]
                fltrd_gap_sizes = [
                    size for size, is_known in zip(all_gap_sizes, all_is_known_gap_size) if is_known
                ]
                if fltrd_gap_sizes:
                    if len(fltrd_gap_sizes) == 1:
                        gap_size = fltrd_gap_sizes[0]
                    else:
                        gap_size = get_gap_size(fltrd_gap_sizes, gap_func)

                # AGP column 7b: "scaffold" unless all sources agree on one gap type
                all_gap_types = set(adjacency_data["gap_type"])
                gap_type = "scaffold"
                if len(all_gap_types) == 1:
                    gap_type = all_gap_types.pop()

                # AGP column 8b
                has_linkage = "yes" if any(adjacency_data["linkage"]) else "no"

                # AGP column 9b
                all_evidences = set(adjacency_data["linkage_evidence"])
                linkage_evidence = "na"
                if has_linkage == "yes":
                    if "na" in all_evidences:
                        all_evidences.remove("na")
                    # Sorted so that the column does not depend on set iteration order
                    linkage_evidence = ";".join(sorted(str(evidence) for evidence in all_evidences))

                agp.add_gap_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + gap_size),
                    str(obj_id),
                    comp_type,
                    str(gap_size),
                    gap_type,
                    has_linkage,
                    linkage_evidence
                )
                obj_pos += gap_size

            obj_id += 1
            visited_nodes.add(next_node)
            current_node = next_node

    # Write all unplaced contigs, sorted so that the output order is reproducible
    remaining_components = scaffold_graph.components - placed_components
    for c in sorted(remaining_components):
        comp_len = str(scaffold_graph.get_component_len(c))
        obj_name = c + "_RagTag" if add_suffix_to_unplaced else c
        agp.add_seq_line(
            obj_name,
            "1",
            comp_len,
            "1",
            "W",
            c,
            "1",
            comp_len,
            "+"
        )

    agp.write()
def main():
    """
    Top-level 'ragtag.py' dispatcher.

    With no arguments or -h/--help, prints the usage text. With -v/--version
    or -c/--citation, prints the version or citation. Otherwise the first
    argument selects a RagTag sub-command script, which is run with the
    remaining arguments.

    Exits with the sub-command's exit status when that status is non-zero,
    and with status 1 for an unrecognized command, so that calling shells and
    pipelines can detect failures.
    """
    VERSION = get_ragtag_version()

    # NOTE(review): the line breaks and column alignment of CITATION and
    # description were lost in the copy of this file under review. The wording
    # and order are unchanged, but the layout below is a reconstruction -
    # confirm against the released help text.
    CITATION = """
Alonge, Michael, et al. "RaGOO: fast and accurate reference-guided scaffolding of draft genomes."
Genome biology 20.1 (2019): 1-17.
    """

    description = """
RagTag: Reference-guided scaffolding and misassembly correction.
Version: %s

usage: ragtag.py <command> [options]

    assembly improvement:
      correct         misassembly correction
      scaffold        synteny scaffolding
      merge           scaffold merging

    file utilities:
      agp2fasta       build a FASTA file from an AGP file
      agpcheck        check for valid AGP file format
      updategff       update gff intervals

    options:
      -c, --citation
      -v, --version""" % VERSION

    # Sub-command name -> executable script that implements it
    subcommands = {
        "correct": "ragtag_correct.py",
        "scaffold": "ragtag_scaffold.py",
        "merge": "ragtag_merge.py",
        "agp2fasta": "ragtag_agp2fasta.py",
        "agpcheck": "ragtag_agpcheck.py",
        "updategff": "ragtag_update_gff.py",
    }

    if len(sys.argv) == 1:
        print(description)
        return

    cmd = sys.argv[1]
    if cmd in ("-h", "--help"):
        print(description)
    elif cmd in ("-v", "--version"):
        print(VERSION)
    elif cmd in ("-c", "--citation"):
        print(CITATION)
    elif cmd in subcommands:
        # Everything after the sub-command name is forwarded untouched
        status = subprocess.call([subcommands[cmd]] + sys.argv[2:])
        if status:
            # Do not hide a failing sub-command behind a zero exit status
            sys.exit(status)
    else:
        print(description)
        print("\n** unrecognized command: %s **" % cmd)
        sys.exit(1)
def main(): description = "Homology-based assembly patching: Make continuous joins and fill gaps " \ "in 'target.fa' using sequences from 'query.fa'" parser = argparse.ArgumentParser(description=description, usage="ragtag.py patch <target.fa> <query.fa>") parser.add_argument("reference", metavar="<target.fa>", nargs='?', default="", type=str, help="target fasta file (uncompressed or bgzipped)") parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str, help="query fasta file (uncompressed or bgzipped)") patch_options = parser.add_argument_group("patching") patch_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="", help="list of target sequences to ignore [null]") patch_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query sequences to ignore [null]") patch_options.add_argument("-f", metavar="INT", type=int, default=1000, help="minimum unique alignment length [1000]") patch_options.add_argument("--remove-small", action="store_true", default=False, help="remove unique alignments shorter than '-f'") patch_options.add_argument("-q", metavar="INT", type=int, default=10, help="minimum mapq (NA for Nucmer alignments) [10]") patch_options.add_argument("-d", metavar="INT", type=int, default=100000, help="maximum alignment merge distance [100000]") patch_options.add_argument("-s", metavar="INT", type=int, default=50000, help="minimum merged alignment length [50000]") patch_options.add_argument("-i", metavar="FLOAT", type=float, default=0.05, help="maximum merged alignment distance from sequence terminus. fraction of the sequence length if < 1 [0.05]") patch_options.add_argument("--fill-only", action="store_true", default=False, help="only fill existing target gaps. do not join target sequences") patch_options.add_argument("--join-only", action="store_true", default=False, help="only join and patch target sequences. 
do not fill existing gaps") io_options = parser.add_argument_group("input/output options") io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]") io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files") io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers") io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS) aln_options = parser.add_argument_group("mapping options") aln_options.add_argument("-t", metavar="INT", type=int, default=1, help="number of minimap2/unimap threads [1]") aln_options.add_argument("--aligner", metavar="PATH", type=str, default="nucmer", help="aligner executable ('nucmer' (recommended), 'unimap' or 'minimap2') [nucmer]") mm2_default = "-x asm5" aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default, help="space delimited minimap2 parameters (overrides '-t') ['%s']" % mm2_default) aln_options.add_argument("--unimap-params", metavar="STR", type=str, default=mm2_default, help="space delimited unimap parameters (overrides '-t') ['%s']" % mm2_default) aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="--maxmatch -l 100 -c 500", help="space delimted nucmer parameters ['--maxmatch -l 100 -c 500']") args = parser.parse_args() if not args.reference or not args.query: parser.print_help() sys.exit("\n** The target and query FASTA files are required **") log("VERSION", "RagTag " + get_ragtag_version()) log("WARNING", "This is a beta version of `ragtag patch`") log("CMD", "ragtag.py patch " + " ".join(sys.argv[1:])) reference_fn = os.path.abspath(args.reference) query_fn = os.path.abspath(args.query) # Check that the reference/query file exists if not os.path.isfile(reference_fn): raise FileNotFoundError("Could not find file: %s" % reference_fn) if not os.path.isfile(query_fn): raise 
FileNotFoundError("Could not find file: %s" % query_fn) # Alignment processing parameters min_ulen = args.f keep_small_uniques = not args.remove_small merge_dist = args.d num_threads = args.t aligner_path = args.aligner aligner = aligner_path.split("/")[-1] if aligner.split("/")[-1] not in {'minimap2', 'unimap', 'nucmer'}: raise ValueError("Must specify either 'minimap2', 'unimap', or 'nucmer' (PATHs allowed) with '--aligner'.") mm2_params = args.mm2_params unimap_params = args.unimap_params nucmer_params = args.nucmer_params # Mapq filtering parameters min_mapq = args.q if aligner == "nucmer": min_mapq = 0 # Add the number of mm2/unimap threads if the mm2 params haven't been overridden. if mm2_params == mm2_default: mm2_params += " -t " + str(num_threads) if unimap_params == mm2_default: unimap_params += " -t " + str(num_threads) # Set reference/query sequences to ignore ref_blacklist = set() exclude_file = args.e if exclude_file: exclude_file = os.path.abspath(args.e) with open(exclude_file, "r") as f: for line in f: ref_blacklist.add(line.rstrip()) query_blacklist = set() skip_file = args.j if skip_file: skip_file = os.path.abspath(skip_file) with open(skip_file, "r") as f: for line in f: query_blacklist.add(line.rstrip()) # Supporting alignment parameters min_sup_aln_len = args.s max_term_dist = args.i if max_term_dist <= 0: raise ValueError("-i must be a positive nonzero number.") # Task options fill_only = args.fill_only join_only = args.join_only if fill_only and join_only: raise ValueError("'--fill-only' and '--join-only' cannot be used together") # I/O parameters add_suffix = args.u if not add_suffix: log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. 
To ensure valid AGP format, use '-u'.") overwrite_files = args.w output_path = args.o if not os.path.isdir(output_path): os.mkdir(output_path) output_path = os.path.abspath(output_path) + "/" file_prefix = "ragtag.patch" # Setup a log file for external RagTag scripts ragtag_log = output_path + file_prefix + ".err" open(ragtag_log, "w").close() # Wipe the log file # Debugging options debug_mode = args.debug # Break the reference assembly at gaps cmd = [ "ragtag_splitasm.py", "-o", output_path + file_prefix + ".ctg.agp", reference_fn ] reference_ctg_fn = output_path + file_prefix + ".ctg.fasta" if os.path.isfile(reference_ctg_fn): if overwrite_files: log("INFO", "Overwriting pre-existing file: " + reference_ctg_fn) run_oae(cmd, reference_ctg_fn, ragtag_log) else: log("INFO", "Retaining pre-existing file: " + reference_ctg_fn) else: run_oae(cmd, reference_ctg_fn, ragtag_log) # Rename the query sequences cmd = [ "ragtag_rename.py", query_fn, "-p", "qseq", "-o", output_path + file_prefix + ".rename.agp", ] query_rename_fn = output_path + file_prefix + ".rename.fasta" if os.path.isfile(query_rename_fn): if overwrite_files: log("INFO", "Overwriting pre-existing file: " + query_rename_fn) run_oae(cmd, query_rename_fn, ragtag_log) else: log("INFO", "Retaining pre-existing file: " + query_rename_fn) else: run_oae(cmd, query_rename_fn, ragtag_log) # Combine the reference contigs and query sequences to make a components fasta file components_fn = output_path + file_prefix + ".comps.fasta" if os.path.isfile(components_fn): if overwrite_files: log("INFO", "Overwriting pre-existing file: " + components_fn) write_comps = True else: log("INFO", "Retaining pre-existing file: " + components_fn) write_comps = False else: write_comps = True if write_comps: log("INFO", "Writing: " + components_fn) ref_fai = pysam.FastaFile(reference_ctg_fn) query_fai = pysam.FastaFile(query_rename_fn) with open(components_fn, "w") as f: for ref in ref_fai.references: f.write(">" + ref + "\n") 
f.write(ref_fai.fetch(ref) + "\n") for query in query_fai.references: f.write(">" + query + "\n") f.write(query_fai.fetch(query) + "\n") # Map the query assembly to the reference contigs log("INFO", "Mapping the query genome to the target genome") if aligner == "minimap2": al = Minimap2Aligner(reference_ctg_fn, [query_rename_fn], aligner_path, mm2_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files) elif aligner == "unimap": al = UnimapAligner(reference_ctg_fn, [query_rename_fn], aligner_path, unimap_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files) else: al = NucmerAligner(reference_ctg_fn, [query_rename_fn], aligner_path, nucmer_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files) al.run_aligner() # If alignments are from Nucmer, need to convert from delta to paf if aligner == "nucmer": cmd = ["ragtag_delta2paf.py", output_path + file_prefix + ".asm.delta"] run_oae(cmd, output_path + file_prefix + ".asm.paf", ragtag_log) # Read and organize the alignments log("INFO", "Reading whole genome alignments") # ctg_alns: query header -> ContigAlignment object ctg_alns = read_genome_alignments(output_path + file_prefix + ".asm.paf", query_blacklist, ref_blacklist) # Check if any alignments are left if not ctg_alns: raise RuntimeError("There are no alignments. 
Check '{}'.".format(output_path + file_prefix + ".asm.paf")) # Filter the alignments unfiltered_strings, filtered_strings, merged_strings, useful_strings = [], [], [], [] log("INFO", "Filtering and merging alignments") fltrd_ctg_alns = dict() for i in ctg_alns: # Unique anchor filtering unfiltered_strings.append(str(ctg_alns[i])) ctg_alns[i] = ctg_alns[i].unique_anchor_filter(min_ulen, keep_small=keep_small_uniques) # mapq filtering if ctg_alns[i] is not None: ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq) if ctg_alns[i] is not None: filtered_strings.append(str(ctg_alns[i])) # alignment merging ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist, careful_merge=True) if ctg_alns[i] is not None: merged_strings.append(str(ctg_alns[i])) # Length filtering ctg_alns[i] = ctg_alns[i].filter_lengths(min_sup_aln_len) if ctg_alns[i] is not None: # terminal filtering ctg_alns[i] = ctg_alns[i].keep_terminals(max_term_dist) # Save the remaining useful alignments if ctg_alns[i] is not None and ctg_alns[i].num_refs > 1 and not ctg_alns[i].has_internal_ref_cuttings(max_term_dist): useful_strings.append(str(ctg_alns[i])) fltrd_ctg_alns[i] = ctg_alns[i] # Write debugging files debug_non_fltrd_file = output_path + file_prefix + ".debug.unfiltered.paf" debug_fltrd_file = output_path + file_prefix + ".debug.filtered.paf" debug_merged_file = output_path + file_prefix + ".debug.merged.paf" debug_useful_file = output_path + file_prefix + ".debug.useful.paf" if debug_mode: with open(debug_non_fltrd_file, "w") as f: f.write("".join(unfiltered_strings)) with open(debug_fltrd_file, "w") as f: f.write("".join(filtered_strings)) with open(debug_merged_file, "w") as f: f.write("".join(merged_strings)) with open(debug_useful_file, "w") as f: f.write("".join(useful_strings)) # Make a Scaffold Graph encoding known reference contigs adjacencies log("INFO", "Building a scaffold graph from the contig AGP file") agp_multi_sg = AGPMultiScaffoldGraph(reference_ctg_fn) 
agp_multi_sg.add_agps([output_path + file_prefix + ".ctg.agp"]) agp_sg = agp_multi_sg.merge() # As a hack, go through the AGP sg and make the required directed scaffold graph agp_psg = PatchScaffoldGraph(components_fn) for u, v in agp_sg.edges: aln = Alignment( u, v, "", agp_sg[u][v]["gap_size"][0], 0, agp_sg[u][v]["gap_size"][0], 0, is_gap=True ) agp_psg.add_edge(u, v, aln) # Make a second directed scaffold graph from the alignments log("INFO", "Building a scaffold graph from the target/query mappings") aln_psg = build_aln_scaffold_graph(fltrd_ctg_alns, components_fn, max_term_dist) # Add edges for unfilled gaps for u, v in agp_psg.edges: if not aln_psg.has_edge(u, v): aln_psg.add_edge(u, v, agp_psg[u][v]["alignment"]) # Remove known false edges for u, v in agp_psg.edges: for neighbor in list(aln_psg.neighbors(u)): if neighbor != v: aln_psg.remove_edge(u, neighbor) aln_psg.remove_edge(neighbor, u) for neighbor in list(aln_psg.neighbors(v)): if neighbor != u: aln_psg.remove_edge(neighbor, v) aln_psg.remove_edge(v, neighbor) # Adjust the graph depending on if only fills or joins are requested if fill_only: psg = PatchScaffoldGraph(components_fn) for u, v in agp_psg.edges: psg.add_edge(u, v, aln_psg[u][v]["alignment"]) psg.add_edge(v, u, aln_psg[v][u]["alignment"]) aln_psg = psg if join_only: for u, v in agp_psg.edges: aln_psg[u][v]["alignment"] = agp_psg[u][v]["alignment"] aln_psg[v][u]["alignment"] = agp_psg[v][u]["alignment"] if debug_mode: aln_psg.write_gml(output_path + file_prefix + ".debug.sg.gml") # Compute a matching solution for the graph log("INFO", "Computing a matching solution to the scaffold graph") match_psg = aln_psg.max_weight_matching() if debug_mode: match_psg.write_gml(output_path + file_prefix + ".debug.matching.gml") # Write the output in AGP format log("INFO", "Writing output files") match_psg.write_agp(output_path + file_prefix + ".agp", output_path + file_prefix + ".ctg.fasta", add_suffix_to_unplaced=add_suffix) # Write the output in fasta 
format cmd = [ "ragtag_agp2fa.py", output_path + file_prefix + ".agp", components_fn ] run_oae(cmd, output_path + file_prefix + ".fasta", ragtag_log) log("INFO", "Goodbye")
def main():
    """Split the sequences of an assembly at gaps.

    Command-line entry point. Reads the assembly FASTA file given as the
    positional argument, treats runs of 'N' (case-insensitive, since the
    sequence is upper-cased first) as gaps, and:

    1. writes an AGP file (path given by '-o') describing every input
       sequence as alternating components and gaps, and
    2. prints each component to standard output in FASTA format.

    Components are named 'seq' followed by a zero-padded, 8-digit counter.
    The counter increases in sorted order of the input sequence headers.

    Only runs of 'N' strictly longer than the '-n' value are treated as gaps.

    Exits with status 1 if the assembly argument is empty.
    """
    parser = argparse.ArgumentParser(description='Split sequences at gaps', usage="ragtag.py splitasm <asm.fa>")
    parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
    parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]")
    parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]")

    # Parse the command line arguments
    args = parser.parse_args()
    if not args.asm:
        # Standard output carries the FASTA records, so diagnostics go to
        # standard error and the failure is reported with a non-zero status.
        parser.print_help(sys.stderr)
        print("\n** The assembly FASTA file is required **", file=sys.stderr)
        sys.exit(1)

    asm_fn = args.asm
    min_gap_size = args.n
    agp_fn = args.o

    # Initialize the AGP file
    agp = AGPFile(agp_fn, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Compile the gap pattern once instead of once per sequence
    gap_pattern = re.compile(r'N+')

    # Process the FASTA file
    new_header_idx = 0
    fai = pysam.FastaFile(asm_fn)
    try:
        for header in sorted(fai.references):
            seq = fai.fetch(header).upper()
            seq_len = fai.get_reference_length(header)

            # 0-based, half-open coordinates of every sufficiently long gap
            gap_coords = [
                (m.start(), m.end())
                for m in gap_pattern.finditer(seq)
                if m.end() - m.start() > min_gap_size
            ]

            if not gap_coords:
                # No gaps: the whole sequence is a single component
                new_header = "seq{0:08}".format(new_header_idx)
                new_header_idx += 1
                agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+")
                continue

            # Sentinel "gap" just past the end of the sequence, so that the
            # component following the last real gap is handled by the loop.
            gap_coords.append((seq_len, seq_len + 1))
            pid = 1

            if gap_coords[0][0]:
                # The sequence doesn't start with a gap
                new_header = "seq{0:08}".format(new_header_idx)
                agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+")
                new_header_idx += 1
                pid += 1

            # Walk over each real gap together with the start of the next one
            for (gap_start, gap_end), (next_gap_start, _) in zip(gap_coords, gap_coords[1:]):
                # Add the gap line
                gap_len = gap_end - gap_start
                agp.add_gap_line(header, str(gap_start + 1), str(gap_end), str(pid), "N", str(gap_len), "scaffold", "yes", "align_genus")
                pid += 1

                # Add the sequence line, unless the gap runs to the end of the sequence
                obj_start, obj_end = gap_end, next_gap_start
                comp_len = obj_end - obj_start
                if gap_end != seq_len:
                    new_header = "seq{0:08}".format(new_header_idx)
                    agp.add_seq_line(header, str(obj_start + 1), obj_end, pid, "W", new_header, "1", str(comp_len), "+")
                    new_header_idx += 1
                    pid += 1

        agp.write()

        # Iterate over the AGP file and print the sequences
        agp = AGPFile(agp_fn, mode="r")
        for line in agp.iterate_lines():
            if not line.is_gap:
                obj, comp, obj_beg, obj_end = line.obj, line.comp, line.obj_beg, line.obj_end
                print(">" + comp)
                # AGP object coordinates are 1-based; fetch() takes a 0-based start
                print(fai.fetch(obj, obj_beg - 1, obj_end))
    finally:
        # Release the FASTA handle even if AGP construction or printing fails
        fai.close()