def write_breaks(out_file, query_file, ctg_breaks, overwrite, remove_suffix):
    """
    Write the intermediate file for contig breaks in AGP v2.1 format.

    Each sequence in the query FASTA becomes one AGP object. A sequence that
    has no entry in ``ctg_breaks`` is written as a single component; a
    sequence with breaks is written as consecutive components, one per
    interval between break points.

    :param out_file: output AGP file name
    :param query_file: query FASTA file name (opened with pysam.FastaFile)
    :param ctg_breaks: mapping of query sequence name -> iterable of break
        positions (each position ends one component and starts the next)
    :param overwrite: if False and out_file already exists, log and return
        without writing anything
    :param remove_suffix: if True, unbroken sequences keep their original
        name as the component ID instead of "name:0-len(+)". Components of
        broken sequences always carry the coordinate suffix.
    """
    # Check if the output file already exists
    if os.path.isfile(out_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_file)
            return
        log("Overwriting pre-existing file: " + out_file)

    fai = pysam.FastaFile(query_file)
    # try/finally guarantees the FASTA handle is released even if building
    # or writing the AGP fails part-way through.
    try:
        agp = AGPFile(out_file, mode="w")
        agp.add_pragma()
        agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

        for q in sorted(fai.references):
            # Look the length up once per sequence rather than per use
            q_len = fai.get_reference_length(q)

            # Check if this sequence was broken during misassembly correction
            if q not in ctg_breaks:
                # Add suffix to query header, unless otherwise requested
                comp_header = q
                if not remove_suffix:
                    comp_header = "{}:{}-{}(+)".format(q, 0, q_len)
                agp.add_seq_line(q, "1", str(q_len), "1", "W", comp_header, "1", str(q_len), "+")
                continue

            # This query sequence was broken: emit one component per interval
            pid = 1
            start = 0
            for brk in sorted(ctg_breaks[q]):
                agp.add_seq_line(
                    q, str(start + 1), str(brk), str(pid), "W",
                    "{}:{}-{}(+)".format(q, start, brk),
                    "1", str(brk - start), "+"
                )
                start = brk
                pid += 1

            # Add one line for the last interval (last break -> sequence end)
            agp.add_seq_line(
                q, str(start + 1), str(q_len), str(pid), "W",
                "{}:{}-{}(+)".format(q, start, q_len),
                "1", str(q_len - start), "+"
            )

        log("Writing: " + out_file)
        agp.write()
    finally:
        fai.close()
def write_agp(self, agp_fn, ref_fn, add_suffix_to_unplaced=False):
    """
    Write the AGP file implied by the scaffold graph.

    The graph is walked one chain at a time. Each chain becomes one AGP
    object named "scf" followed by a zero-padded, 0-based counter. Within a
    chain, every edge contributes an optional patch/gap line (taken from the
    edge's "alignment" attribute) followed by the line for the sequence the
    edge leads to. Reference sequences that were never used as a component
    are written afterwards as single-component objects.

    Graph nodes are sequence names with a two-character end suffix
    ("_b" or "_e"); the code strips it with ``[:-2]`` to recover the name.

    :param agp_fn: AGP file name
    :param ref_fn: reference FASTA file name
    :param add_suffix_to_unplaced: add "_RagTag" to unscaffolded sequences
    """
    used_components = set()
    # Every traversed edge is stored in both directions so that it is never
    # picked again as the start of a new object.
    used_edges = set()
    # Starts at -1 because it is incremented before use; first object is scf00000000.
    obj_header_idx = -1

    agp = AGPFile(agp_fn, "w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    while True:
        # Find a starting node
        from_node = None
        to_node = None
        cur_ref = None
        for u, v in sorted(self.edges):
            if (u, v) not in used_edges:
                u_base = u[:-2]

                # Sum the degree of both end nodes of this sequence
                u_degree = 0
                if u_base + "_b" in self.nodes:
                    u_degree += self.graph.degree[u_base + "_b"]
                if u_base + "_e" in self.nodes:
                    u_degree += self.graph.degree[u_base + "_e"]

                # NOTE(review): presumably 2 means "connected on one end only" and
                # 4 means "connected on both ends" - confirm against how edges are added.
                assert u_degree in {2, 4}

                # Check if we have found a starting target sequence
                if u_degree == 2:
                    cur_ref = u_base
                    from_node = u
                    to_node = v
                    used_edges.add((u, v))
                    used_edges.add((v, u))
                    break

        # If we haven't found a new starting target sequence, we are done
        if from_node is None:
            break

        # Initialize this object
        obj_header_idx += 1
        obj_header = "scf" + "{0:08}".format(obj_header_idx)
        obj_pos = 0  # 0-based end of what has been written to this object so far
        obj_pid = 1  # AGP part number within this object

        # Process the first target sequence
        cur_ref_len = self.component_lens[cur_ref]
        cur_ref_strand = "+"
        # Leaving the first sequence through its "_b" node means it is written reversed
        if from_node.endswith("_b"):
            cur_ref_strand = "-"
        agp.add_seq_line(obj_header, obj_pos+1, obj_pos+cur_ref_len, obj_pid, "W", cur_ref, 1, cur_ref_len, cur_ref_strand)
        obj_pos += cur_ref_len
        obj_pid += 1
        used_components.add(cur_ref)

        # Process the remaining sequences.
        next_edge_exists = True
        while next_edge_exists:
            # Process the patch
            patch_aln = self.graph[from_node][to_node]["alignment"]
            patch_query = patch_aln.query
            patch_strand = "+"
            if patch_aln.strand:
                patch_strand = "-"

            # Positive: there is material to insert between the two sequences.
            # Zero or negative: nothing is inserted (negative is handled below).
            patch_len = patch_aln.their_query_start - patch_aln.my_query_end
            if patch_len > 0:
                if patch_aln.is_gap:
                    agp.add_gap_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "N", patch_len, "scaffold", "yes", "align_genus")
                else:
                    agp.add_seq_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "W", patch_query, patch_aln.my_query_end+1, patch_aln.their_query_start, patch_strand)
                    used_components.add(patch_query)
                obj_pos += patch_len
                obj_pid += 1

            # Next, process the reference sequence
            # comp_start is 0, or negative when patch_len is negative; in that case
            # the first -comp_start bases of the next component are skipped.
            comp_start = min(0, patch_len)
            cur_ref = to_node[:-2]
            cur_ref_len = self.component_lens[cur_ref]
            cur_ref_strand = "+"
            # Entering a sequence through its "_e" node means it is written reversed
            if to_node.endswith("_e"):
                cur_ref_strand = "-"
            agp.add_seq_line(obj_header, obj_pos+1, obj_pos+(cur_ref_len + comp_start), obj_pid, "W", cur_ref, 1+(-1*comp_start), cur_ref_len, cur_ref_strand)
            obj_pos += cur_ref_len + comp_start
            obj_pid += 1
            used_components.add(cur_ref)

            # Look for the next edge
            # Continue from the opposite end of the sequence we just entered
            from_node = to_node[:-2] + "_b"
            if to_node.endswith("_b"):
                from_node = to_node[:-2] + "_e"

            if from_node in self.graph.nodes:
                next_nodes = set(self.graph[from_node])
                assert len(next_nodes) == 1
                to_node = next_nodes.pop()
                used_edges.add((from_node, to_node))
                used_edges.add((to_node, from_node))
            else:
                # The opposite end is not in the graph: this object is complete
                next_edge_exists = False

    # Write unplaced reference sequences
    fai = pysam.FastaFile(ref_fn)
    all_ref_seqs = set(fai.references)
    fai.close()
    remaining_components = all_ref_seqs - used_components
    for c in sorted(remaining_components):
        agp.add_seq_line(
            # str * bool: "_RagTag" when the flag is True, "" when it is False
            c + "_RagTag" * add_suffix_to_unplaced,
            "1",
            str(self.component_lens[c]),
            "1",
            "W",
            c,
            "1",
            str(self.component_lens[c]),
            "+"
        )

    agp.write()
def write_orderings(out_agp_file, out_confidence_file, query_file, ordering_dict, ctg_dict, gap_dict, gap_type_dict, make_chr0, overwrite, add_suffix):
    """
    Write the scaffold AGP file and the per-query confidence score table.

    For every reference header (in sorted order) an object named
    "<ref>_RagTag" is written, made of the ordered query sequences with a
    gap line after each query for which a gap length is available. Query
    sequences that were not placed are then written either concatenated
    into "Chr0_RagTag" (100 bp "U" gaps between them) or one object each.

    :param out_agp_file: output AGP file name
    :param out_confidence_file: output TSV file name for confidence scores
    :param query_file: query FASTA file name
    :param ordering_dict: reference header -> ordered placements; the query
        name is read from index 2 of each placement
    :param ctg_dict: query name -> object exposing query_len, orientation,
        grouping_confidence, location_confidence, orientation_confidence
    :param gap_dict: reference header -> list of gap lengths
    :param gap_type_dict: reference header -> list of AGP gap component types
    :param make_chr0: concatenate unplaced sequences into "Chr0_RagTag"
    :param overwrite: if False and out_agp_file exists, log and return
        without writing either output file
    :param add_suffix: append "_RagTag" to individually listed unplaced objects
    """
    # Bail out early when an existing result must be kept
    if os.path.isfile(out_agp_file):
        if overwrite:
            log("Overwriting pre-existing file: " + out_agp_file)
        else:
            log("Retaining pre-existing file: " + out_agp_file)
            return

    placed = set()
    confidence_rows = []
    agp = AGPFile(out_agp_file, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # One AGP object per reference sequence, in sorted order
    for ref_header in sorted(ordering_dict):
        obj_name = ref_header + "_RagTag"
        placements = ordering_dict[ref_header]
        gap_lens = gap_dict[ref_header]
        gap_kinds = gap_type_dict[ref_header]
        part = 1
        offset = 0

        for idx, placement in enumerate(placements):
            q = placement[2]
            placed.add(q)
            ctg = ctg_dict[q]
            qlen = ctg.query_len
            strand = ctg.orientation
            scores = (ctg.grouping_confidence, ctg.location_confidence, ctg.orientation_confidence)

            # Sequence line for this query
            seq_beg = offset + 1
            offset += qlen
            agp.add_seq_line(
                obj_name, str(seq_beg), str(offset), str(part),
                "W", q, "1", str(qlen), strand
            )
            confidence_rows.append("\t".join([q] + [str(s) for s in scores]))
            part += 1

            # Gap line following this query, when one is defined
            if idx < len(gap_lens):
                gap_beg = offset + 1
                offset += gap_lens[idx]
                agp.add_gap_line(
                    obj_name, str(gap_beg), str(offset), str(part),
                    gap_kinds[idx], str(gap_lens[idx]),
                    "scaffold", "yes", "align_genus"
                )
                part += 1

    # Everything in the query FASTA that was not placed above
    fai = pysam.FastaFile(query_file)
    unplaced = sorted(set(fai.references) - placed)

    if unplaced and make_chr0:
        chr0 = "Chr0_RagTag"
        offset = 0
        part = 1
        for q in unplaced:
            qlen = fai.get_reference_length(q)
            seq_beg = offset + 1
            offset += qlen
            agp.add_seq_line(
                chr0, str(seq_beg), str(offset), str(part),
                "W", q, "1", str(qlen), "+"
            )
            part += 1

            # Fixed-size gap of unknown length after every sequence
            gap_beg = offset + 1
            offset += 100
            agp.add_gap_line(
                chr0, str(gap_beg), str(offset), str(part),
                "U", "100", "contig", "no", "na"
            )
            part += 1

        # The loop always ends on a gap; drop that trailing one
        agp.pop_agp_line()
    elif unplaced:
        # Each unplaced contig becomes its own single-component object
        for q in unplaced:
            qlen = fai.get_reference_length(q)
            obj_name = q + "_RagTag" if add_suffix else q
            agp.add_seq_line(
                obj_name, "1", str(qlen), "1",
                "W", q, "1", str(qlen), "+"
            )

    agp.write()
    fai.close()

    # Confidence score table: header row followed by one row per placed query
    with open(out_confidence_file, "w") as f:
        f.write(
            "query\tgrouping_confidence\tlocation_confidence\torientation_confidence\n"
        )
        f.write("\n".join(confidence_rows) + "\n")
def write_agp_solution(cover_graph, scaffold_graph, agp_fname, gap_func="MIN", add_suffix_to_unplaced=False):
    """
    Write the AGP file for a scaffolding solution.

    Here, we work with two graphs: a cover_graph and a scaffold_graph.
    The cover graph defines a solution to the scaffold graph, and nodes from
    the same component are connected for convenience. We use the
    scaffold_graph for any original scaffold graph info/functionality.

    Each connected component of the cover graph becomes one AGP object
    ("scf" + 1-based zero-padded index + "_RagTag"). The component is walked
    from a degree-1 node to the other degree-1 node. An edge between the two
    end nodes of the same sequence ("_b"/"_e" suffixes) yields a sequence
    line; any other edge yields a gap line built from the scaffold graph's
    adjacency data. Components that were never placed are written last, in
    sorted order, one object each.

    :param cover_graph: graph describing the chosen solution (paths)
    :param scaffold_graph: ScaffoldGraphBase instance with component lengths
        and per-adjacency gap/linkage data
    :param agp_fname: output AGP file name
    :param gap_func: passed to get_gap_size() when an adjacency has more
        than one known gap size
    :param add_suffix_to_unplaced: add "_RagTag" to unplaced sequences
    :raises TypeError: if scaffold_graph is not a ScaffoldGraphBase
    """
    if not isinstance(scaffold_graph, ScaffoldGraphBase):
        raise TypeError("scaffold_graph must be an instance of ScaffoldGraph")

    placed_components = set()

    agp = AGPFile(agp_fname, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Iterate through the connected components
    for i, cc in enumerate(nx.connected_components(G=cover_graph)):
        # Sort the list of nodes for deterministic output
        cc = sorted(cc)
        obj_header = "scf" + "{0:08}".format(i+1) + "_RagTag"

        # The first degree-1 node (in sorted order) is where the walk starts
        current_node = None
        for node in cc:
            if cover_graph.degree[node] == 1:
                current_node = node
                break

        assert current_node is not None

        # Starting with the degree=1 node, build the AGP object from nodes in the path.
        visited_nodes = {current_node}
        degree = 0
        obj_id = 1
        obj_pos = 0

        # Traverse the component until we find the other end node
        while degree != 1:
            conn_nodes = set(cover_graph.neighbors(current_node))
            next_node = (conn_nodes - visited_nodes).pop()
            degree = cover_graph.degree[next_node]
            comp_len = scaffold_graph.get_component_len(next_node[:-2])

            # Check if this is an intra or inter sequence edge
            orientation = "+"
            if next_node[:-2] == current_node[:-2]:
                # Intra-sequence edge: walking end -> beginning means reversed
                if next_node.endswith("_b"):
                    orientation = "-"
                    assert current_node.endswith("_e")
                agp.add_seq_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + comp_len),
                    str(obj_id),
                    "W",
                    next_node[:-2],
                    "1",
                    str(comp_len),
                    orientation
                )
                obj_pos += comp_len
                placed_components.add(next_node[:-2])
            else:
                # Inter-sequence edge: organize the gap info
                adjacency_data = scaffold_graph[current_node][next_node]

                # AGP Column 5: "N" if any supporting gap has a known size, else "U"
                all_is_known_gap_size = adjacency_data["is_known_gap_size"]
                comp_type = "N" if any(all_is_known_gap_size) else "U"

                # AGP column 6b: defaults to 100 when no gap size is known
                gap_size = 100
                all_gap_sizes = adjacency_data["gap_size"]
                fltrd_gap_sizes = [
                    size for size, is_known in zip(all_gap_sizes, all_is_known_gap_size) if is_known
                ]
                if fltrd_gap_sizes:
                    if len(fltrd_gap_sizes) == 1:
                        gap_size = fltrd_gap_sizes[0]
                    else:
                        gap_size = get_gap_size(fltrd_gap_sizes, gap_func)

                # AGP column 7b: fall back to "scaffold" when the types disagree
                all_gap_types = set(adjacency_data["gap_type"])
                gap_type = "scaffold"
                if len(all_gap_types) == 1:
                    gap_type = all_gap_types.pop()

                # AGP column 8b
                has_linkage = "yes" if any(adjacency_data["linkage"]) else "no"

                # AGP column 9b
                all_evidences = set(adjacency_data["linkage_evidence"])
                linkage_evidence = "na"
                if has_linkage == "yes":
                    all_evidences.discard("na")
                    # Sorted so that the joined field does not depend on set
                    # iteration order (which varies with string hashing).
                    linkage_evidence = ";".join(sorted(str(e) for e in all_evidences))

                agp.add_gap_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + gap_size),
                    str(obj_id),
                    comp_type,
                    str(gap_size),
                    gap_type,
                    has_linkage,
                    linkage_evidence
                )
                obj_pos += gap_size

            obj_id += 1
            visited_nodes.add(next_node)
            current_node = next_node

    # Write all unplaced contigs, sorted for deterministic output
    remaining_components = scaffold_graph.components - placed_components
    for c in sorted(remaining_components):
        comp_len = str(scaffold_graph.get_component_len(c))
        agp.add_seq_line(
            # str * bool: "_RagTag" when the flag is True, "" when it is False
            c + "_RagTag" * add_suffix_to_unplaced,
            "1",
            comp_len,
            "1",
            "W",
            c,
            "1",
            comp_len,
            "+"
        )

    agp.write()
def main():
    """
    Command line entry point for "ragtag.py splitasm".

    Splits every sequence of the assembly FASTA at runs of "N" longer than
    the -n threshold, writes the resulting object/component layout to an
    AGP file, then re-reads that AGP file and prints each non-gap component
    to stdout in FASTA format.
    """
    parser = argparse.ArgumentParser(description='Split sequencs at gaps', usage="ragtag.py splitasm <asm.fa>")
    parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
    parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]")
    parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]")

    # Parse the command line arguments
    args = parser.parse_args()
    if not args.asm:
        parser.print_help()
        print("\n** The assembly FASTA file is required **")
        sys.exit()

    asm_fn, min_gap_size, agp_fn = args.asm, args.n, args.o

    # Initialize the AGP file
    agp = AGPFile(agp_fn, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Process the FASTA file
    comp_idx = 0  # running counter used to name the output components
    fai = pysam.FastaFile(asm_fn)
    for header in sorted(fai.references):
        seq = fai.fetch(header).upper()
        seq_len = fai.get_reference_length(header)

        # 0-based, half-open coordinates of every gap longer than the threshold
        gaps = []
        for match in re.finditer(r'N+', seq):
            if match.end() - match.start() > min_gap_size:
                gaps.append((match.start(), match.end()))

        if not gaps:
            # No qualifying gap: the whole sequence is a single component
            comp_name = "seq{0:08}".format(comp_idx)
            comp_idx += 1
            agp.add_seq_line(header, "1", seq_len, "1", "W", comp_name, "1", seq_len, "+")
            continue

        # Sentinel "gap" just past the end so the last real gap has a successor
        gaps.append((seq_len, seq_len + 1))
        part = 1

        leading_len = gaps[0][0]
        if leading_len:
            # The sequence doesn't start with a gap
            comp_name = "seq{0:08}".format(comp_idx)
            agp.add_seq_line(header, "1", str(leading_len), str(part), "W", comp_name, "1", str(leading_len), "+")
            comp_idx += 1
            part += 1

        # Walk consecutive (gap, next gap) pairs: gap line, then the sequence between them
        for (gap_start, gap_end), (next_gap_start, _) in zip(gaps, gaps[1:]):
            agp.add_gap_line(
                header, str(gap_start + 1), str(gap_end), str(part),
                "N", str(gap_end - gap_start), "scaffold", "yes", "align_genus"
            )
            part += 1

            # Nothing follows a gap that reaches the end of the sequence
            if gap_end != seq_len:
                comp_name = "seq{0:08}".format(comp_idx)
                agp.add_seq_line(
                    header, str(gap_end + 1), next_gap_start, part,
                    "W", comp_name, "1", str(next_gap_start - gap_end), "+"
                )
                comp_idx += 1
                part += 1

    agp.write()

    # Iterate over the AGP file and print the sequences
    agp = AGPFile(agp_fn, mode="r")
    for agp_line in agp.iterate_lines():
        if agp_line.is_gap:
            continue
        print(">" + agp_line.comp)
        # AGP coordinates are 1-based inclusive; fetch() takes a 0-based start
        print(fai.fetch(agp_line.obj, agp_line.obj_beg - 1, agp_line.obj_end))

    fai.close()