def main():
    """
    Break query sequences (AGP objects) into their components and print FASTA.

    Command line arguments are an AGP file and a query FASTA file. For every
    AGP line, the object interval [obj_beg, obj_end] is fetched from the FASTA
    file and printed to stdout under the component name.

    :raises ValueError: if the AGP file has a gap line or a '-' orientation
    """
    parser = argparse.ArgumentParser(
        description="Break corrected query sequences (objects) into components.")
    parser.add_argument("agp", metavar="<ragtag.correction.agp>", type=str,
                        help="AGP v2.1 file produced by 'ragtag.py correct'")
    parser.add_argument("query", metavar="<query.fasta>", type=str,
                        help="query fasta file to be scaffolded. must not be gzipped")

    args = parser.parse_args()
    agp_file = args.agp
    query_file = args.query

    x = pysam.FastaFile(query_file)
    try:
        agp = AGPFile(agp_file)

        # Iterate through the agp file
        for line in agp.iterate_lines():
            if line.is_gap:
                raise ValueError("The AGP file should have no gaps.")
            if line.orientation == "-":
                raise ValueError("No sequences should have a '-' orientation.")

            # AGP coordinates are 1-based inclusive; fetch() takes 0-based half-open
            start, end = int(line.obj_beg) - 1, int(line.obj_end)
            print(">" + line.comp)
            print(x.fetch(line.obj, start, end))
    finally:
        # Release the FASTA handle even when validation fails
        x.close()
def main():
    """
    Break corrected query sequences (AGP objects) into components and print FASTA.

    Command line arguments are the correction AGP file and the query FASTA
    file. For every AGP line, the object interval is fetched from the FASTA
    file and printed to stdout under the component name.

    :raises ValueError: if the AGP file has a gap line or a '-' orientation
    """
    parser = argparse.ArgumentParser(
        description="Break corrected query sequences (objects) into components.")
    parser.add_argument("agp", metavar="<ragtag.correction.agp>", type=str,
                        help="AGP v2.1 file produced by 'ragtag.py correct'")
    parser.add_argument(
        "query", metavar="<query.fasta>", type=str,
        help="query fasta file corresponding to objects in <ragtag.correction.agp> "
             "(can be uncompressed or bgzipped)")

    args = parser.parse_args()
    agp_file = args.agp
    query_file = args.query

    fai = pysam.FastaFile(query_file)
    try:
        agp = AGPFile(agp_file, mode="r")

        # Iterate through the agp file
        for line in agp.iterate_lines():
            if line.is_gap:
                raise ValueError("The AGP file should have no gaps.")
            if line.orientation == "-":
                raise ValueError("No sequences should have a '-' orientation.")

            # AGP coordinates are 1-based inclusive; fetch() takes 0-based half-open
            start, end = int(line.obj_beg) - 1, int(line.obj_end)
            print(">" + line.comp)
            print(fai.fetch(line.obj, start, end))
    finally:
        # Close the FASTA handle on both the success and the error path
        fai.close()
def main():
    """
    Print the FASTA sequences described by an AGP file.

    Every AGP object becomes one FASTA record on stdout. Gap lines contribute
    runs of 'N'; sequence lines contribute the requested slice of the
    component, reverse complemented when the orientation is '-'.
    If either positional argument is missing, the help text is printed and
    the program exits.
    """
    cli = argparse.ArgumentParser(
        description="Build sequences in FASTA format from an AGP v2.1 file.",
        usage="ragtag.py agp2fa <scaffolds.agp> <components.fasta>")
    cli.add_argument("agp", metavar="<scaffolds.agp>", nargs='?', default="",
                     type=str, help="AGP v2.1 file")
    cli.add_argument("components", metavar="<components.fasta>", nargs='?',
                     default="", type=str,
                     help="component FASTA file (can be uncompressed or bgzipped)")

    opts = cli.parse_args()
    if not (opts.agp and opts.components):
        cli.print_help()
        sys.exit()

    fasta = pysam.FastaFile(opts.components)
    agp_reader = AGPFile(opts.agp, mode="r")

    last_obj = None
    header_written = False
    for rec in agp_reader.iterate_lines():
        # A change of object name starts a new FASTA record
        if rec.obj != last_obj:
            header = ">" + rec.obj
            if header_written:
                # Terminate the previous record's sequence line first
                header = "\n" + header
            print(header)
            header_written = True
            last_obj = rec.obj

        if rec.is_gap:
            sys.stdout.write("N" * rec.gap_len)
            continue

        piece = fasta.fetch(rec.comp, rec.comp_beg - 1, rec.comp_end)
        if rec.orientation == "-":
            piece = reverse_complement(piece)
        sys.stdout.write(piece)

    # End the FASTA file with a newline
    sys.stdout.write("\n")
    fasta.close()
def main():
    """
    Print scaffolding statistics computed from an AGP file and a confidence file.

    The first column of the confidence file (after its header line) defines
    the set of placed sequences. AGP sequence lines are then counted as
    placed or unplaced depending on whether their component is in that set,
    and gap lines are tallied separately. One header row and one value row
    (tab separated) are written to stdout.

    :raises ValueError: if a non-empty confidence line does not have exactly
        four tab-separated columns
    """
    parser = argparse.ArgumentParser(description="Calculate scaffolding statistics")
    parser.add_argument("agp", nargs='?', default="", metavar="<ragtag.scaffolds.agp>",
                        type=str, help="RagTag scaffolding AGP file")
    parser.add_argument("confidence", nargs='?', default="",
                        metavar="<ragtag.confidence.txt>", type=str,
                        help="RagTag scaffolding confidence scores file")

    args = parser.parse_args()
    if not args.agp or not args.confidence:
        parser.print_help()
        sys.exit()

    agp_file = args.agp
    confidence_file = args.confidence

    placed_bp = 0
    placed_seq = 0
    unplaced_bp = 0
    unplaced_seq = 0
    gap_bp = 0
    gap_seq = 0

    # Get the set of placed sequences from the confidence scores file
    placed_seqs = set()
    with open(confidence_file, "r") as f:
        f.readline()  # discard header
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate empty lines (e.g. a file listing no placed sequences)
                # instead of failing on the unpacking below.
                continue
            header, _g_score, _l_score, _o_score = line.split("\t")
            placed_seqs.add(header)

    # Iterate through the AGP file
    agp = AGPFile(agp_file, mode="r")
    for line in agp.iterate_lines():
        if line.is_gap:
            gap_bp += line.gap_len
            gap_seq += 1
        else:
            # Component coordinates are 1-based inclusive
            seq_len = line.comp_end - (line.comp_beg - 1)
            if line.comp in placed_seqs:
                placed_bp += seq_len
                placed_seq += 1
            else:
                unplaced_bp += seq_len
                unplaced_seq += 1

    print("placed_sequences\tplaced_bp\tunplaced_sequences\tunplaced_bp\tgap_bp\tgap_sequences")
    print("\t".join([
        str(placed_seq),
        str(placed_bp),
        str(unplaced_seq),
        str(unplaced_bp),
        str(gap_bp),
        str(gap_seq)
    ]))
def sub_update(gff_file, agp_file):
    """
    Lift GFF features from original sequences onto their sub-sequences (components).

    The AGP file maps intervals of each object onto component names. Each GFF
    feature must fall inside exactly one such interval; its sequence name and
    coordinates are rewritten relative to that component and the line is
    printed to stdout. Comment lines (starting with '#') are echoed unchanged.

    :param gff_file: GFF file with features on the original sequences
    :param agp_file: AGP file describing how objects were split into components
    :raises ValueError: if the AGP file has gaps, '-' orientations or non-'W'
        components, or if a feature maps to zero or several components
    """
    # Make a dictionary associating each original sequence with an interval tree of component sequences
    trans = defaultdict(IntervalTree)
    agp = AGPFile(agp_file, mode="r")
    for agp_line in agp.iterate_lines():
        # Check that the agp file looks correct for this task.
        # Gaps are tested first: otherwise a gap line would be rejected by the
        # component-type check below and reported with the wrong message.
        if agp_line.is_gap:
            raise ValueError(
                "There should be no gaps in the correction AGP file.")
        if agp_line.orientation == "-":
            raise ValueError(
                "The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction."
            )
        if not agp_line.comp_type == "W":
            raise ValueError(
                "The placement BED file is not formatted correctly. All lines should be WGS contig (W)."
            )

        # Store the object interval as zero-based, half-open
        start, end = agp_line.obj_beg - 1, agp_line.obj_end
        trans[agp_line.obj][start:end] = agp_line.comp

    # Iterate through the gff intervals and update them according to trans
    with open(gff_file, "r") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith("#"):
                print(line)  # Print this comment line
            else:
                fields = line.split("\t")
                h, s, e = fields[0], int(fields[3]), int(fields[4])
                s -= 1  # Keep everything zero-indexed

                # Membership test does not create an entry in the defaultdict
                if h not in trans:
                    raise ValueError("Inconsistent input files.")

                ovlps = trans[h][s:e]
                if len(ovlps) > 1:
                    raise ValueError(
                        "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'"
                        % (h, s, e))
                if len(ovlps) < 1:
                    raise ValueError(
                        "The placement BED file is not formatted correctly.")

                # Get the data from the overlapping interval and print the new line
                o = list(ovlps)[0]
                new_s = s - o.begin
                new_e = e - o.begin
                fields[0] = o.data
                fields[3] = str(new_s + 1)  # back to one-based indexing for gff format
                fields[4] = str(new_e)
                print("\t".join(fields))
def sup_update(gff_file, agp_file):
    """
    Lift GFF features from component sequences onto the objects that contain them.

    Each non-gap AGP line records where a component sits in its object and in
    which orientation. Feature coordinates are mirrored within the component
    when it was placed in '-' orientation, then offset by the component's
    start in the object. Updated lines are printed to stdout; comment lines
    (starting with '#') are echoed unchanged.

    :param gff_file: GFF file with features on the component sequences
    :param agp_file: AGP file describing how components build objects
    :raises ValueError: if a GFF sequence name is not a component in the AGP file
    """
    # Make a dictionary associating each original sequence with the destination sequence
    trans = {}
    strands = {}
    seq_lens = {}
    agp = AGPFile(agp_file, mode="r")
    for agp_line in agp.iterate_lines():
        if not agp_line.is_gap:
            start, end = agp_line.obj_beg - 1, agp_line.obj_end
            trans[agp_line.comp] = (start, end, agp_line.obj)
            strands[agp_line.comp] = agp_line.orientation
            seq_lens[agp_line.comp] = end - start

    # Iterate through the gff intervals and update them according to trans
    with open(gff_file, "r") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith("#"):
                print(line)  # Print this comment line
            else:
                fields = line.split("\t")
                h, s, e, st = fields[0], int(fields[3]), int(fields[4]), fields[6]
                s -= 1  # Keep everything zero-indexed

                if h not in trans:
                    print()
                    print(line)
                    raise ValueError("Inconsistent input files.")

                # Check if the original sequence has been reverse complemented
                if strands[h] == "-":
                    comp_len = seq_lens[h]
                    s, e = comp_len - e, comp_len - s
                    # Only swap real strands. Other values (such as "." or "?")
                    # carry no direction and must be passed through unchanged
                    # rather than being turned into "+".
                    if st == "+":
                        st = "-"
                    elif st == "-":
                        st = "+"

                obj_start, _obj_end, obj_name = trans[h]
                new_s = obj_start + s
                new_e = obj_start + e
                fields[0] = obj_name
                fields[3] = str(new_s + 1)  # back to one-based indexing for gff format
                fields[4] = str(new_e)
                fields[6] = st
                print("\t".join(fields))
def main():
    """
    Print the FASTA sequences described by an AGP file.

    Every AGP object becomes one FASTA record on stdout. Gap lines contribute
    runs of 'N'; sequence lines contribute the component interval
    [comp_beg, comp_end], reverse complemented when the orientation is '-'.
    """
    parser = argparse.ArgumentParser(
        description="Build sequences in FASTA format from an AGP v2.1 file. ")
    parser.add_argument("agp", metavar="<scaffolds.agp>", type=str,
                        help="AGP v2.1 file")
    parser.add_argument(
        "components", metavar="<components.fasta>", type=str,
        help="FASTA file with component sequences to be scaffolded. must not be gzipped")

    args = parser.parse_args()
    agp_file = args.agp
    components_file = args.components

    fai = pysam.FastaFile(components_file)
    try:
        agp = AGPFile(agp_file)

        # Iterate over the lines of the AGP file
        prev_obj = None
        is_first = True
        for agp_line in agp.iterate_lines():
            if agp_line.obj != prev_obj:
                if is_first:
                    print(">" + agp_line.obj)
                    is_first = False
                else:
                    # Terminate the previous record's sequence line first
                    print("\n>" + agp_line.obj)
                prev_obj = agp_line.obj

            if agp_line.is_gap:
                sys.stdout.write("N" * agp_line.gap_len)
            else:
                # Honor the component coordinates instead of always emitting
                # the whole component. AGP is 1-based inclusive, fetch() is
                # 0-based half-open. int() accepts both str and int fields.
                seq = fai.fetch(agp_line.comp,
                                int(agp_line.comp_beg) - 1,
                                int(agp_line.comp_end))
                if agp_line.orientation == "-":
                    seq = reverse_complement(seq)
                sys.stdout.write(seq)

        # End the FASTA file with a newline
        sys.stdout.write("\n")
    finally:
        # The handle was previously never released
        fai.close()
def main():
    """
    Rename the records of a FASTA file and record the mapping in an AGP file.

    Records are renamed to the prefix followed by a zero-padded, 8-digit
    running index (starting at 0). The renamed FASTA is printed to stdout and
    one AGP sequence line per record (new name as object, old name as
    component) is written to the output AGP path.
    """
    cli = argparse.ArgumentParser(description="Rename FASTA records.",
                                  usage="ragtag_rename.py <seqs.fa> [-p PREFIX]")
    cli.add_argument("fasta_fn", metavar="<seqs.fa>", default="", type=str,
                     help="FASTA file (uncompressed or bgzipped)")
    cli.add_argument("-p", metavar="STR", type=str, default="", help="prefix")
    cli.add_argument("-o", metavar="PATH", type=str, default="ragtag.rename.agp",
                     help="output AGP file path [./ragtag.rename.agp]")

    opts = cli.parse_args()

    out_agp = AGPFile(opts.o, "w")
    fasta = pysam.FastaFile(opts.fasta_fn)

    for idx, old_name in enumerate(fasta.references):
        new_name = opts.p + "{0:08}".format(idx)
        n_bases = fasta.get_reference_length(old_name)

        # The whole record maps one-to-one onto the renamed object
        out_agp.add_seq_line(new_name, 1, n_bases, "1", "W", old_name, 1, n_bases, "+")

        print(">" + new_name)
        print(fasta.fetch(old_name))

    out_agp.write()
    fasta.close()
def main():
    """
    Check one or more AGP files by reading every line of each of them.

    A disclaimer is printed first. Each file is then opened in read mode and
    fully iterated; the per-file completion message is only logged when the
    iteration finishes without raising.
    """
    cli = argparse.ArgumentParser(description="Check AGP v2.1 files for validity.",
                                  usage="ragtag.py agpcheck <asm1.agp> [<asm2.agp> ... <asmN.agp>]")
    cli.add_argument("agp", metavar="<asm1.agp> [<asm2.agp> ... <asmN.agp>]",
                     nargs='+', default=[], type=str, help="AGP v2.1 files")

    DISCLAIMER = """ DISCLAIMER: This utility performs most (but not all) checks necessary to validate an AGP v2.1 file: https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/ Please additionally use the NCBI AGP validator for robust validation: https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Validation/ """

    opts = cli.parse_args()
    print(DISCLAIMER)

    for path in map(os.path.abspath, opts.agp):
        print()
        log("INFO", "Checking {} ...".format(path))

        # Reading every line is the check itself; the lines are not used
        reader = AGPFile(path, mode="r")
        for _ in reader.iterate_lines():
            pass

        log("INFO", "Check for {} is complete with no errors.".format(path))
def write_breaks(out_file, query_file, ctg_breaks, overwrite, remove_suffix):
    """
    Write the intermediate file for contig breaks in AGP v2.1 format.

    Unbroken query sequences get a single AGP line. Broken sequences get one
    line per interval between consecutive break points, with components named
    "<query>:<start>-<end>(+)".

    :param out_file: output AGP file name
    :param query_file: query FASTA file name
    :param ctg_breaks: mapping of query sequence name to its break positions
    :param overwrite: if falsy, an existing out_file is kept and nothing is written
    :param remove_suffix: if truthy, unbroken sequences keep their original
        name as the component name (no ":0-<len>(+)" suffix)
    """
    # Check if the output file already exists
    if os.path.isfile(out_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_file)
            return
        log("Overwriting pre-existing file: " + out_file)

    # Read every sequence length once and release the FASTA handle right away
    # (it was previously left open).
    x = pysam.FastaFile(query_file)
    try:
        q_lens = {q: x.get_reference_length(q) for q in x.references}
    finally:
        x.close()

    agp = AGPFile(out_file, "w")
    agp.add_comment("## agp-version 2.1")
    agp.add_comment("# AGP created by RagTag")

    for q in sorted(q_lens):
        q_len = q_lens[q]

        # Check if this sequence was broken during misassembly correction
        if q not in ctg_breaks:
            # Add suffix to query header, unless otherwise requested
            unchanged_comp_header = q
            if not remove_suffix:
                unchanged_comp_header = q + ":0" + "-" + str(q_len) + "(+)"

            agp.add_seq_line(q, "1", str(q_len), "1", "W",
                             unchanged_comp_header, "1", str(q_len), "+")
            continue

        # This query sequence was broken
        pid = 1
        start = 0
        for brk in sorted(ctg_breaks[q]):
            agp.add_seq_line(q, str(start + 1), str(brk), str(pid), "W",
                             q + ":" + str(start) + "-" + str(brk) + "(+)",
                             "1", str(brk - start), "+")
            start = brk
            pid += 1

        # Add one line for the last interval
        agp.add_seq_line(q, str(start + 1), str(q_len), str(pid), "W",
                         q + ":" + str(start) + "-" + str(q_len) + "(+)",
                         "1", str(q_len - start), "+")

    log("Writing: " + out_file)
    agp.write()
def write_agp(self, agp_fn, ref_fn, add_suffix_to_unplaced=False):
    """
    Write the AGP file implied by the scaffold graph

    Objects are named "scf" plus a zero-padded, 8-digit index. Each object is
    built by repeatedly picking an unused edge whose source sequence has a
    total degree of 2, emitting that sequence, and then following edges
    (patch, next sequence, patch, ...) until no further edge exists.
    Reference sequences that were never used are appended as single-line objects.

    :param agp_fn: AGP file name
    :param ref_fn: reference FASTA file name
    :param add_suffix_to_unplaced: add "_RagTag" to unscaffolded sequences
    """
    used_components = set()
    used_edges = set()
    obj_header_idx = -1

    agp = AGPFile(agp_fn, "w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Each pass of this loop writes one AGP object
    while True:
        # Find a starting node
        from_node = None
        to_node = None
        cur_ref = None
        for u, v in sorted(self.edges):
            if (u, v) not in used_edges:
                # Node names are a sequence name plus a two-character end suffix ("_b" or "_e")
                u_base = u[:-2]

                # Sum the degrees of both end nodes of this sequence
                u_degree = 0
                if u_base + "_b" in self.nodes:
                    u_degree += self.graph.degree[u_base + "_b"]
                if u_base + "_e" in self.nodes:
                    u_degree += self.graph.degree[u_base + "_e"]

                assert u_degree in {2, 4}

                # Check if we have found a starting target sequence
                if u_degree == 2:
                    cur_ref = u_base
                    from_node = u
                    to_node = v
                    # Mark the edge as used in both directions
                    used_edges.add((u, v))
                    used_edges.add((v, u))
                    break

        # If we haven't found a new starting target sequence, we are done
        if from_node is None:
            break

        # Initialize this object
        obj_header_idx += 1
        obj_header = "scf" + "{0:08}".format(obj_header_idx)
        obj_pos = 0
        obj_pid = 1

        # Process the first target sequence
        cur_ref_len = self.component_lens[cur_ref]
        cur_ref_strand = "+"
        # Leaving the first sequence through its "_b" node means it is written in "-" orientation
        if from_node.endswith("_b"):
            cur_ref_strand = "-"
        agp.add_seq_line(obj_header, obj_pos+1, obj_pos+cur_ref_len, obj_pid, "W", cur_ref, 1, cur_ref_len, cur_ref_strand)
        obj_pos += cur_ref_len
        obj_pid += 1
        used_components.add(cur_ref)

        # Process the remaining sequences.
        next_edge_exists = True
        while next_edge_exists:
            # Process the patch
            patch_aln = self.graph[from_node][to_node]["alignment"]
            patch_query = patch_aln.query
            patch_strand = "+"
            # A truthy strand attribute is written as "-" orientation
            if patch_aln.strand:
                patch_strand = "-"

            patch_len = patch_aln.their_query_start - patch_aln.my_query_end
            # Only a positive patch length produces an AGP line (gap or patch sequence)
            if patch_len > 0:
                if patch_aln.is_gap:
                    agp.add_gap_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "N", patch_len, "scaffold", "yes", "align_genus")
                else:
                    agp.add_seq_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "W", patch_query, patch_aln.my_query_end+1, patch_aln.their_query_start, patch_strand)
                    # NOTE(review): nesting reconstructed from a whitespace-collapsed source — confirm this belongs to the else branch
                    used_components.add(patch_query)
                obj_pos += patch_len
                obj_pid += 1

            # Next, process the reference sequence
            # comp_start is 0 or negative; a negative value shortens the next
            # sequence's span and moves its component start forward by that amount
            # (presumably to account for an overlap — TODO confirm)
            comp_start = min(0, patch_len)
            cur_ref = to_node[:-2]
            cur_ref_len = self.component_lens[cur_ref]
            cur_ref_strand = "+"
            # Entering a sequence through its "_e" node means it is written in "-" orientation
            if to_node.endswith("_e"):
                cur_ref_strand = "-"
            agp.add_seq_line(obj_header, obj_pos+1, obj_pos+(cur_ref_len + comp_start), obj_pid, "W", cur_ref, 1+(-1*comp_start), cur_ref_len, cur_ref_strand)
            obj_pos += cur_ref_len + comp_start
            obj_pid += 1
            used_components.add(cur_ref)

            # Look for the next edge
            # Continue from the opposite end of the sequence that was just written
            from_node = to_node[:-2] + "_b"
            if to_node.endswith("_b"):
                from_node = to_node[:-2] + "_e"

            if from_node in self.graph.nodes:
                next_nodes = set(self.graph[from_node])
                assert len(next_nodes) == 1
                to_node = next_nodes.pop()
                used_edges.add((from_node, to_node))
                used_edges.add((to_node, from_node))
            else:
                next_edge_exists = False

    # Write unplaced reference sequences
    fai = pysam.FastaFile(ref_fn)
    all_ref_seqs = set(fai.references)
    fai.close()
    remaining_components = all_ref_seqs - used_components
    for c in sorted(remaining_components):
        # Multiplying the suffix by the boolean appends it only when the flag is set
        agp.add_seq_line(
            c + "_RagTag" * add_suffix_to_unplaced,
            "1",
            str(self.component_lens[c]),
            "1",
            "W",
            c,
            "1",
            str(self.component_lens[c]),
            "+"
        )

    agp.write()
def _get_assembly_points(self, agp, weight):
    """
    Yield one AssemblyPoint for every pair of consecutive components of an object.

    Gap lines between two components of the same object are summarized and
    attached to the assembly point that follows them. Components that touch
    without a gap are reported with the evidence "bookend".

    :param agp: An AGP file defining sequence adjacencies
    :param weight: The weight to assign to each adjacency
    :raises RuntimeError: if a component is unknown or its AGP end coordinate
        does not match the known component length
    :raises ValueError: if two gaps are consecutive, or if (once the file is
        exhausted) the AGP components differ from self.components
    """
    seen_comps = set()
    current_obj = ""
    left_comp = ""
    left_strand = ""

    # Summary of the gap(s) seen since the last component
    n_gaps = 0
    gap_is_known = None
    gap_size = 0
    gap_type = None
    gap_linkage = ""
    gap_evidence = ""

    reader = AGPFile(agp)
    for rec in reader.iterate_lines():
        if rec.is_gap:
            # Gaps only count once a component of this object has been seen
            if rec.obj == current_obj:
                n_gaps += 1
                gap_is_known = rec.comp_type == "N"
                gap_size = rec.gap_len
                gap_type = rec.gap_type
                gap_linkage = rec.linkage == "yes"
                gap_evidence = rec.linkage_evidence
            continue

        # Add this component to our master list
        if rec.comp not in self.component_lens:
            raise RuntimeError("{} is in {} but not {}.".format(rec.comp, agp, self.components_fasta_fname))
        seen_comps.add(rec.comp)

        agp_len = rec.comp_end
        known_len = self.get_component_len(rec.comp)
        if agp_len < known_len:
            raise RuntimeError("only complete components can be added to the graph.")
        if agp_len > known_len:
            raise RuntimeError(
                "inconsistent component lengths: {} bp in {} and {} bp in {}".format(
                    agp_len, agp, known_len, self.components_fasta_fname))

        if rec.obj == current_obj:
            # Check if these components are bookended (no gap in between)
            if not n_gaps:
                gap_evidence = "bookend"

            # Check if two consecutive gaps preceded this component
            if n_gaps > 1:
                raise ValueError("Consecutive gaps in the AGP file are not currently supported.")

            yield AssemblyPoint(
                left_comp, left_strand, rec.comp, rec.orientation, weight, agp,
                gap_is_known, gap_size, gap_type, gap_linkage, gap_evidence
            )
        else:
            # First component of a new object: nothing to pair it with yet
            current_obj = rec.obj

        # This component becomes the left side of the next adjacency
        left_comp = rec.comp
        left_strand = rec.orientation
        n_gaps = 0
        gap_is_known = None
        gap_size = 0
        gap_type = None
        gap_linkage = ""
        gap_evidence = ""

    if seen_comps != self.components:
        raise ValueError("Input AGPs do not have the same set of components.")
def write_orderings(out_agp_file, out_confidence_file, query_file,
                    ordering_dict, ctg_dict, gap_dict, gap_type_dict,
                    make_chr0, overwrite, add_suffix):
    """
    Write the scaffolding AGP file and the confidence scores file.

    For every reference header (in sorted order) an object named
    "<ref>_RagTag" is built from its ordered query sequences, interleaved with
    the gaps given for that reference. Query sequences that were not placed
    are either concatenated into "Chr0_RagTag" (separated by 100 bp gaps) or
    written as individual objects.

    :param out_agp_file: output AGP file name
    :param out_confidence_file: output confidence scores file name
    :param query_file: query FASTA file name
    :param ordering_dict: reference header -> ordered list of items whose
        third element is the query sequence name
    :param ctg_dict: query name -> object providing query_len, orientation and
        the grouping/location/orientation confidence attributes
    :param gap_dict: reference header -> list of gap sizes
    :param gap_type_dict: reference header -> list of AGP gap component types
    :param make_chr0: if truthy, join unplaced sequences into "Chr0_RagTag"
    :param overwrite: if falsy, an existing AGP file is kept and nothing is written
    :param add_suffix: if truthy, individually listed unplaced sequences get
        a "_RagTag" suffix
    """
    # Check if the output file already exists
    if os.path.isfile(out_agp_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_agp_file)
            return
        log("Overwriting pre-existing file: " + out_agp_file)

    # Proceed with writing the intermediate output
    placed_seqs = set()
    all_out_cs_lines = []  # For confidence scores

    agp = AGPFile(out_agp_file, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Go through the reference sequences in sorted order
    for ref_header in sorted(ordering_dict):
        pid = 1
        pos = 0
        new_ref_header = ref_header + "_RagTag"
        q_seqs = ordering_dict[ref_header]
        gap_seqs = gap_dict[ref_header]
        gap_types = gap_type_dict[ref_header]

        # Iterate through the query sequences for this reference header
        for i, q_seq in enumerate(q_seqs):
            q = q_seq[2]
            placed_seqs.add(q)
            ctg = ctg_dict[q]
            qlen = ctg.query_len

            obj_beg = pos + 1
            pos += qlen
            agp.add_seq_line(new_ref_header, str(obj_beg), str(pos), str(pid),
                             "W", q, "1", str(qlen), ctg.orientation)
            pid += 1

            # Save the confidence score info
            all_out_cs_lines.append("\t".join([
                q,
                str(ctg.grouping_confidence),
                str(ctg.location_confidence),
                str(ctg.orientation_confidence),
            ]))

            # A gap follows every sequence that has a gap size defined for it
            if i < len(gap_seqs):
                obj_beg = pos + 1
                pos += gap_seqs[i]
                agp.add_gap_line(new_ref_header, str(obj_beg), str(pos), str(pid),
                                 gap_types[i], str(gap_seqs[i]),
                                 "scaffold", "yes", "align_genus")
                pid += 1

    # Collect the unplaced sequences and their lengths, then release the
    # FASTA handle even if reading fails.
    fai = pysam.FastaFile(query_file)
    try:
        unplaced = [(q, fai.get_reference_length(q))
                    for q in sorted(set(fai.references) - placed_seqs)]
    finally:
        fai.close()

    # Write unplaced sequences
    if unplaced:
        if make_chr0:
            pos = 0
            pid = 1
            new_ref_header = "Chr0_RagTag"
            for q, qlen in unplaced:
                obj_beg = pos + 1
                pos += qlen
                agp.add_seq_line(new_ref_header, str(obj_beg), str(pos), str(pid),
                                 "W", q, "1", str(qlen), "+")
                pid += 1

                # Now for the gap, since we are making a chr0
                obj_beg = pos + 1
                pos += 100
                agp.add_gap_line(new_ref_header, str(obj_beg), str(pos), str(pid),
                                 "U", "100", "contig", "no", "na")
                pid += 1

            # Remove the final unnecessary gap
            agp.pop_agp_line()
        else:
            # List the unplaced contigs individually
            for q, qlen in unplaced:
                obj_name = q + "_RagTag" if add_suffix else q
                agp.add_seq_line(obj_name, "1", str(qlen), "1", "W",
                                 q, "1", str(qlen), "+")

    agp.write()

    # Write the confidence scores. Lines are written one by one so that an
    # empty list yields only the header, not a trailing blank line that
    # tab-splitting readers cannot parse.
    with open(out_confidence_file, "w") as f:
        f.write(
            "query\tgrouping_confidence\tlocation_confidence\torientation_confidence\n"
        )
        f.writelines(cs_line + "\n" for cs_line in all_out_cs_lines)
def write_agp_solution(cover_graph, scaffold_graph, agp_fname, gap_func="MIN", add_suffix_to_unplaced=False):
    """
    Write the AGP file for a solution (cover graph) of a scaffold graph.

    Here, we work with two graphs: a cover_graph and a scaffold_graph. A
    cover graph defines a solution to the scaffold graph, and nodes from the
    same component are connected for convenience. The scaffold_graph is used
    for any original scaffold graph info/functionality.

    Every connected component of the cover graph becomes one AGP object named
    "scf<8-digit index>_RagTag" (indices start at 1). Components of the
    scaffold graph that are not part of any object are written as
    single-line objects, in sorted order.

    :param cover_graph: graph whose connected components are walked end to end
    :param scaffold_graph: ScaffoldGraphBase instance providing component
        lengths and adjacency (gap) data
    :param agp_fname: output AGP file name
    :param gap_func: passed to get_gap_size() when several known gap sizes exist
    :param add_suffix_to_unplaced: add "_RagTag" to unplaced components
    :raises TypeError: if scaffold_graph is not a ScaffoldGraphBase
    """
    if not isinstance(scaffold_graph, ScaffoldGraphBase):
        raise TypeError("scaffold_graph must be an instance of ScaffoldGraph")

    placed_components = set()

    agp = AGPFile(agp_fname, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Iterate through the connected components
    for i, cc in enumerate(nx.connected_components(G=cover_graph)):
        # Sort the list of nodes for deterministic output
        cc = sorted(cc)

        obj_header = "scf" + "{0:08}".format(i+1) + "_RagTag"
        current_node = None

        # Iterate over each node in the connected component until we find a node with degree=1
        for node in cc:
            if cover_graph.degree[node] == 1:
                current_node = node
                break

        assert current_node is not None

        # Starting with the degree=1 node, build the AGP object from nodes in the path.
        visited_nodes = {current_node}
        degree = 0
        obj_id = 1
        obj_pos = 0

        # Traverse the component until we find the other end node
        while degree != 1:
            conn_nodes = set(cover_graph.neighbors(current_node))
            next_node = (conn_nodes - visited_nodes).pop()
            degree = cover_graph.degree[next_node]
            comp_len = scaffold_graph.get_component_len(next_node[:-2])

            # Check if this is an intra or inter sequence edge
            orientation = "+"
            if next_node[:-2] == current_node[:-2]:
                # Both nodes belong to the same component: write the sequence.
                # Walking from the "_e" node to the "_b" node means "-".
                if next_node.endswith("_b"):
                    orientation = "-"
                    assert current_node.endswith("_e")

                agp.add_seq_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + comp_len),
                    str(obj_id),
                    "W",
                    next_node[:-2],
                    "1",
                    str(comp_len),
                    orientation
                )
                obj_pos += comp_len
                placed_components.add(next_node[:-2])
            else:
                # Organize the gap info
                adjacency_data = scaffold_graph[current_node][next_node]

                # AGP Column 5
                all_is_known_gap_size = adjacency_data["is_known_gap_size"]
                comp_type = "N" if any(all_is_known_gap_size) else "U"

                # AGP column 6b (100 unless at least one known gap size exists)
                gap_size = 100
                all_gap_sizes = adjacency_data["gap_size"]
                fltrd_gap_sizes = [size for size, is_known
                                   in zip(all_gap_sizes, all_is_known_gap_size)
                                   if is_known]
                if fltrd_gap_sizes:
                    if len(fltrd_gap_sizes) == 1:
                        gap_size = fltrd_gap_sizes[0]
                    else:
                        gap_size = get_gap_size(fltrd_gap_sizes, gap_func)

                # AGP column 7b ("scaffold" unless all sources agree on one type)
                all_gap_types = set(adjacency_data["gap_type"])
                gap_type = "scaffold"
                if len(all_gap_types) == 1:
                    gap_type = all_gap_types.pop()

                # AGP column 8b
                has_linkage = "yes" if any(adjacency_data["linkage"]) else "no"

                # AGP column 9b
                all_evidences = set(adjacency_data["linkage_evidence"])
                linkage_evidence = "na"
                if has_linkage == "yes":
                    all_evidences.discard("na")
                    # Sorted so the same input always gives the same output
                    linkage_evidence = ";".join(sorted(str(e) for e in all_evidences))

                agp.add_gap_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + gap_size),
                    str(obj_id),
                    comp_type,
                    str(gap_size),
                    gap_type,
                    has_linkage,
                    linkage_evidence
                )
                obj_pos += gap_size

            obj_id += 1
            visited_nodes.add(next_node)
            current_node = next_node

    # Write all unplaced contigs (sorted: set order is not deterministic)
    remaining_components = scaffold_graph.components - placed_components
    for c in sorted(remaining_components):
        comp_len = scaffold_graph.get_component_len(c)
        agp.add_seq_line(
            # Multiplying by the boolean appends the suffix only when requested
            c + "_RagTag" * add_suffix_to_unplaced,
            "1",
            str(comp_len),
            "1",
            "W",
            c,
            "1",
            str(comp_len),
            "+"
        )

    agp.write()
def lens_from_agp(fname):
    """
    Collect the length of every object in an AGP file.

    :param fname: AGP file name
    :return: list of the `obj_len` attribute of each object, in iteration order
    """
    lengths = []
    reader = AGPFile(fname, mode="r")
    for agp_obj in reader.iterate_objs():
        lengths.append(agp_obj.obj_len)
    return lengths
def main():
    """
    Split the sequences of an assembly at runs of 'N' and print the pieces.

    An AGP file describing each input sequence as alternating pieces and
    gaps is written first. Only runs of 'N' longer than the minimum gap size
    are treated as gaps. The AGP file is then read back and every non-gap
    piece is printed to stdout as a FASTA record named "seq<8-digit index>".
    """
    cli = argparse.ArgumentParser(description='Split sequencs at gaps',
                                  usage="ragtag.py splitasm <asm.fa>")
    cli.add_argument("asm", metavar="<asm.fa>", default="", type=str,
                     help="assembly fasta file (uncompressed or bgzipped)")
    cli.add_argument("-n", metavar="INT", type=int, default=0,
                     help="minimum gap size [0]")
    cli.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp",
                     help="output AGP file path [./ragtag.splitasm.agp]")

    # Parse the command line arguments
    opts = cli.parse_args()
    if not opts.asm:
        cli.print_help()
        print("\n** The assembly FASTA file is required **")
        sys.exit()

    asm_fn, min_gap_size, agp_fn = opts.asm, opts.n, opts.o

    # Initialize the AGP file
    out_agp = AGPFile(agp_fn, mode="w")
    out_agp.add_pragma()
    out_agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Process the FASTA file
    next_idx = 0
    fai = pysam.FastaFile(asm_fn)
    for header in sorted(fai.references):
        seq = fai.fetch(header).upper()
        seq_len = fai.get_reference_length(header)

        gap_coords = []
        for hit in re.finditer(r'N+', seq):
            if hit.end() - hit.start() > min_gap_size:
                gap_coords.append((hit.start(), hit.end()))

        if not gap_coords:
            # Nothing to split: the whole sequence is one piece
            piece_name = "seq{0:08}".format(next_idx)
            next_idx += 1
            out_agp.add_seq_line(header, "1", seq_len, "1", "W", piece_name, "1", seq_len, "+")
            continue

        # Sentinel interval so the piece after the last gap is handled by the same loop
        gap_coords.append((seq_len, seq_len + 1))
        pid = 1

        first_gap_start = gap_coords[0][0]
        if first_gap_start:
            # The sequence doesn't start with a gap
            piece_name = "seq{0:08}".format(next_idx)
            out_agp.add_seq_line(header, "1", str(first_gap_start), str(pid), "W",
                                 piece_name, "1", str(first_gap_start), "+")
            next_idx += 1
            pid += 1

        # Pair every gap with the gap (or sentinel) that follows it
        for (gap_start, gap_end), (next_gap_start, _) in zip(gap_coords, gap_coords[1:]):
            # Add the gap line
            out_agp.add_gap_line(header, str(gap_start + 1), str(gap_end), str(pid),
                                 "N", str(gap_end - gap_start),
                                 "scaffold", "yes", "align_genus")
            pid += 1

            # Add the sequence line, unless the gap runs to the end of the sequence
            piece_name = "seq{0:08}".format(next_idx)
            if gap_end != seq_len:
                out_agp.add_seq_line(header, str(gap_end + 1), next_gap_start, pid, "W",
                                     piece_name, "1", str(next_gap_start - gap_end), "+")
                next_idx += 1
                pid += 1

    out_agp.write()

    # Iterate over the AGP file and print the sequences
    in_agp = AGPFile(agp_fn, mode="r")
    for rec in in_agp.iterate_lines():
        if rec.is_gap:
            continue
        print(">" + rec.comp)
        print(fai.fetch(rec.obj, rec.obj_beg - 1, rec.obj_end))

    fai.close()