Ejemplo n.º 1
0
def write_breaks(out_file, query_file, ctg_breaks, overwrite, remove_suffix):
    """Write the intermediate contig-break file in AGP v2.1 format.

    Every sequence in ``query_file`` becomes one AGP object. A sequence that
    has an entry in ``ctg_breaks`` is described by consecutive pieces split at
    its sorted break coordinates; any other sequence is described by a single
    component covering its full length.

    :param out_file: path of the AGP file to write
    :param query_file: query FASTA file, opened with ``pysam.FastaFile``
    :param ctg_breaks: container keyed by query name whose values are the
        break coordinates for that sequence
    :param overwrite: when false and ``out_file`` already exists, nothing is
        written and the function returns early
    :param remove_suffix: when true, unbroken sequences keep their plain name
        as the component id instead of ``name:0-len(+)``
    """
    # Only replace an existing output file when explicitly asked to.
    if os.path.isfile(out_file):
        if overwrite:
            log("Overwriting pre-existing file: " + out_file)
        else:
            log("Retaining pre-existing file: " + out_file)
            return

    def interval_name(name, beg, end):
        # Component id of the form "<name>:<beg>-<end>(+)".
        return name + ":" + str(beg) + "-" + str(end) + "(+)"

    fai = pysam.FastaFile(query_file)
    query_names = sorted(fai.references)
    agp = AGPFile(out_file, mode="w")

    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    for q in query_names:
        q_len = fai.get_reference_length(q)

        if q in ctg_breaks:
            # This query sequence was broken: one line per piece.
            part_id = 1
            left = 0
            for right in sorted(ctg_breaks[q]):
                agp.add_seq_line(
                    q, str(left + 1), str(right), str(part_id), "W",
                    interval_name(q, left, right), "1", str(right - left),
                    "+")
                left = right
                part_id += 1

            # The piece between the last break and the end of the sequence.
            agp.add_seq_line(
                q, str(left + 1), str(q_len), str(part_id), "W",
                interval_name(q, left, q_len), "1", str(q_len - left), "+")
            continue

        # Unbroken sequence: suffix the component id unless told not to.
        comp_name = q if remove_suffix else interval_name(q, 0, q_len)
        agp.add_seq_line(q, "1", str(q_len), "1", "W", comp_name, "1",
                         str(q_len), "+")

    log("Writing: " + out_file)
    agp.write()
    fai.close()
Ejemplo n.º 2
0
    def write_agp(self, agp_fn, ref_fn, add_suffix_to_unplaced=False):
        """
        Write the AGP file implied by the scaffold graph.

        The method repeatedly picks a starting sequence (one whose "_b"/"_e"
        end nodes have a combined degree of 2), then follows edges from node
        to node, emitting one AGP object per walk. Objects are named "scf"
        followed by a zero-padded, 8-digit index starting at 0. Each edge
        carries an "alignment" attribute that describes the patch (gap or
        sequence) placed between two consecutive sequences. Every sequence
        in the reference FASTA that was not written during a walk is
        emitted afterwards as its own single-component object.

        :param agp_fn: AGP file name
        :param ref_fn: reference FASTA file name
        :param add_suffix_to_unplaced: add "_RagTag" to unscaffolded sequences
        """
        # Components already written to the AGP file, and edges already
        # traversed (stored in both directions).
        used_components = set()
        used_edges = set()
        # Incremented before use, so the first object index is 0.
        obj_header_idx = -1

        agp = AGPFile(agp_fn, "w")
        agp.add_pragma()
        agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

        while True:
            # Find a starting node
            from_node = None
            to_node = None
            cur_ref = None
            for u, v in sorted(self.edges):
                if (u, v) not in used_edges:
                    # Drop the two-character end suffix ("_b" or "_e") to get
                    # the sequence name.
                    u_base = u[:-2]

                    # Combined degree of whichever end nodes of this sequence
                    # exist in the graph.
                    u_degree = 0
                    if u_base + "_b" in self.nodes:
                        u_degree += self.graph.degree[u_base + "_b"]
                    if u_base + "_e" in self.nodes:
                        u_degree += self.graph.degree[u_base + "_e"]

                    # NOTE(review): presumably 2 means the sequence is joined
                    # on one side only (a path end) and 4 means it is joined
                    # on both sides - confirm against how the graph is built.
                    assert u_degree in {2, 4}

                    # Check if we have found a starting target sequence
                    if u_degree == 2:
                        cur_ref = u_base
                        from_node = u
                        to_node = v
                        used_edges.add((u, v))
                        used_edges.add((v, u))
                        break

            # If we haven't found a new starting target sequence, we are done
            if from_node is None:
                break

            # Initialize this object
            obj_header_idx += 1
            obj_header = "scf" + "{0:08}".format(obj_header_idx)
            obj_pos = 0  # last object coordinate written so far
            obj_pid = 1  # AGP part number within this object

            # Process the first target sequence
            cur_ref_len = self.component_lens[cur_ref]
            # Leaving the first sequence through its "_b" node puts it on the
            # minus strand; otherwise it is on the plus strand.
            cur_ref_strand = "+"
            if from_node.endswith("_b"):
                cur_ref_strand = "-"
            agp.add_seq_line(obj_header, obj_pos+1, obj_pos+cur_ref_len, obj_pid, "W", cur_ref, 1, cur_ref_len, cur_ref_strand)
            obj_pos += cur_ref_len
            obj_pid += 1
            used_components.add(cur_ref)

            # Process the remaining sequences.
            next_edge_exists = True
            while next_edge_exists:
                # Process the patch
                patch_aln = self.graph[from_node][to_node]["alignment"]
                patch_query = patch_aln.query
                # A truthy `strand` attribute is written as minus orientation.
                patch_strand = "+"
                if patch_aln.strand:
                    patch_strand = "-"

                # Number of bases to insert between the two sequences. A
                # patch line is only written when this is positive.
                patch_len = patch_aln.their_query_start - patch_aln.my_query_end
                if patch_len > 0:
                    if patch_aln.is_gap:
                        agp.add_gap_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "N", patch_len, "scaffold", "yes", "align_genus")
                    else:
                        agp.add_seq_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "W", patch_query, patch_aln.my_query_end+1, patch_aln.their_query_start, patch_strand)
                        used_components.add(patch_query)
                    obj_pos += patch_len
                    obj_pid += 1

                # Next, process the reference sequence
                # comp_start is zero or negative. When patch_len is negative,
                # the next sequence starts at 1 + |patch_len| instead of 1, so
                # that many leading bases are left out.
                # NOTE(review): presumably this trims an overlap between the
                # adjacent sequences - confirm.
                comp_start = min(0, patch_len)
                cur_ref = to_node[:-2]
                cur_ref_len = self.component_lens[cur_ref]
                # Entering a sequence through its "_e" node puts it on the
                # minus strand.
                cur_ref_strand = "+"
                if to_node.endswith("_e"):
                    cur_ref_strand = "-"
                agp.add_seq_line(obj_header, obj_pos+1, obj_pos+(cur_ref_len + comp_start), obj_pid, "W", cur_ref, 1+(-1*comp_start), cur_ref_len, cur_ref_strand)
                obj_pos += cur_ref_len + comp_start
                obj_pid += 1
                used_components.add(cur_ref)

                # Look for the next edge
                # Continue from the opposite end node of the sequence that
                # was just written.
                from_node = to_node[:-2] + "_b"
                if to_node.endswith("_b"):
                    from_node = to_node[:-2] + "_e"

                if from_node in self.graph.nodes:
                    # The walk expects exactly one neighbor at this point.
                    next_nodes = set(self.graph[from_node])
                    assert len(next_nodes) == 1
                    to_node = next_nodes.pop()
                    used_edges.add((from_node, to_node))
                    used_edges.add((to_node, from_node))
                else:
                    # The opposite end is not in the graph: this object ends.
                    next_edge_exists = False

        # Write unplaced reference sequences
        fai = pysam.FastaFile(ref_fn)
        all_ref_seqs = set(fai.references)
        fai.close()
        remaining_components = all_ref_seqs - used_components
        for c in sorted(remaining_components):
            agp.add_seq_line(
                # Multiplying the string by the boolean yields either the
                # suffix or an empty string.
                c + "_RagTag" * add_suffix_to_unplaced,
                "1",
                str(self.component_lens[c]),
                "1",
                "W",
                c,
                "1",
                str(self.component_lens[c]),
                "+"
            )

        agp.write()
Ejemplo n.º 3
0
def write_orderings(out_agp_file, out_confidence_file, query_file,
                    ordering_dict, ctg_dict, gap_dict, gap_type_dict,
                    make_chr0, overwrite, add_suffix):
    """Write the scaffold orderings as an AGP file plus a confidence table.

    For each reference header (in sorted order) an object named
    ``<header>_RagTag`` is written, made of the ordered query sequences with
    a gap line after each one for which a gap length is available. Query
    sequences that were not placed are written afterwards, either joined
    into ``Chr0_RagTag`` or as individual objects.

    If ``out_agp_file`` already exists and ``overwrite`` is false, the
    function returns early and writes neither output file.

    :param out_agp_file: path of the AGP file to write
    :param out_confidence_file: path of the tab-separated confidence file
    :param query_file: query FASTA file, opened with ``pysam.FastaFile``
    :param ordering_dict: reference header -> ordered placements; the query
        name is read from index 2 of each placement
    :param ctg_dict: query name -> object providing ``query_len``,
        ``orientation`` and the three ``*_confidence`` attributes
    :param gap_dict: reference header -> gap lengths, indexed like the
        placements
    :param gap_type_dict: reference header -> component type written for
        each gap line
    :param make_chr0: join the unplaced sequences into ``Chr0_RagTag``,
        separated by 100 bp "U" gaps
    :param overwrite: replace a pre-existing ``out_agp_file``
    :param add_suffix: append ``_RagTag`` to the object names of unplaced
        sequences that are listed individually
    """
    # Only replace an existing output file when explicitly asked to.
    if os.path.isfile(out_agp_file):
        if overwrite:
            log("Overwriting pre-existing file: " + out_agp_file)
        else:
            log("Retaining pre-existing file: " + out_agp_file)
            return

    placed = set()
    confidence_rows = []
    agp = AGPFile(out_agp_file, mode="w")

    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # One AGP object per reference sequence, in sorted order.
    for ref_header in sorted(ordering_dict):
        scaffold_name = ref_header + "_RagTag"
        part_id = 1
        offset = 0
        placements = ordering_dict[ref_header]
        gap_lens = gap_dict[ref_header]
        gap_kinds = gap_type_dict[ref_header]

        for idx, placement in enumerate(placements):
            q = placement[2]
            placed.add(q)
            ctg = ctg_dict[q]
            q_len = ctg.query_len

            # The component line for this query sequence.
            beg = offset + 1
            offset += q_len
            agp.add_seq_line(scaffold_name, str(beg), str(offset),
                             str(part_id), "W", q, "1", str(q_len),
                             ctg.orientation)
            part_id += 1

            # The matching row of the confidence table.
            confidence_rows.append("\t".join([
                q,
                str(ctg.grouping_confidence),
                str(ctg.location_confidence),
                str(ctg.orientation_confidence),
            ]))

            # The gap that follows this sequence, when one is defined.
            if idx < len(gap_lens):
                gap_len = gap_lens[idx]
                beg = offset + 1
                offset += gap_len
                agp.add_gap_line(scaffold_name, str(beg), str(offset),
                                 str(part_id), gap_kinds[idx], str(gap_len),
                                 "scaffold", "yes", "align_genus")
                part_id += 1

    # Everything in the query FASTA that was not placed above.
    fai = pysam.FastaFile(query_file)
    unplaced = sorted(set(fai.references) - placed)

    if make_chr0:
        if unplaced:
            chr0_name = "Chr0_RagTag"
            offset = 0
            part_id = 1
            for q in unplaced:
                q_len = fai.get_reference_length(q)
                beg = offset + 1
                offset += q_len
                agp.add_seq_line(chr0_name, str(beg), str(offset),
                                 str(part_id), "W", q, "1", str(q_len), "+")
                part_id += 1

                # A fixed 100 bp gap of unknown size after every sequence.
                beg = offset + 1
                offset += 100
                agp.add_gap_line(chr0_name, str(beg), str(offset),
                                 str(part_id), "U", "100", "contig", "no",
                                 "na")
                part_id += 1

            # Drop the gap that was added after the last sequence.
            agp.pop_agp_line()
    else:
        # Each unplaced sequence becomes its own single-component object.
        for q in unplaced:
            q_len = fai.get_reference_length(q)
            obj_name = q + "_RagTag" if add_suffix else q
            agp.add_seq_line(obj_name, "1", str(q_len), "1", "W", q, "1",
                             str(q_len), "+")

    agp.write()
    fai.close()

    # Header row followed by one row per placed sequence.
    table_header = (
        "query\tgrouping_confidence\tlocation_confidence\torientation_confidence\n"
    )
    with open(out_confidence_file, "w") as f:
        f.write(table_header)
        f.write("\n".join(confidence_rows) + "\n")
Ejemplo n.º 4
0
def write_agp_solution(cover_graph, scaffold_graph, agp_fname, gap_func="MIN", add_suffix_to_unplaced=False):
    """
    Write the AGP file described by a cover graph.

    Here, we work with two graphs: a cover_graph and a scaffold_graph. A
    cover graph defines a solution to the scaffold graph, and nodes from the
    same component are connected for convenience. The scaffold_graph is used
    for any original scaffold graph info/functionality (component lengths
    and the adjacency data stored on its edges).

    Each connected component of the cover graph becomes one AGP object named
    "scf<8-digit index>_RagTag", with the index starting at 1. The component
    is walked from one degree-1 node to the other; an edge between the two
    end nodes of the same sequence yields a sequence line, and any other
    edge yields a gap line. Components of the scaffold graph that are not
    placed in any object are written afterwards, in sorted order, as
    single-component objects.

    :param cover_graph: graph whose connected components are the scaffolds
    :param scaffold_graph: ScaffoldGraphBase instance holding component
        lengths and edge attributes
    :param agp_fname: AGP file name
    :param gap_func: passed to get_gap_size() when an adjacency has more
        than one known gap size
    :param add_suffix_to_unplaced: add "_RagTag" to the object names of
        unplaced components
    :raises TypeError: if scaffold_graph is not a ScaffoldGraphBase
    """
    if not isinstance(scaffold_graph, ScaffoldGraphBase):
        raise TypeError("scaffold_graph must be an instance of ScaffoldGraph")

    placed_components = set()

    agp = AGPFile(agp_fname, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Iterate through the connected components
    for i, cc in enumerate(nx.connected_components(G=cover_graph)):
        # Sort the list of nodes for deterministic output
        cc = sorted(cc)
        obj_header = "scf" + "{0:08}".format(i+1) + "_RagTag"
        current_node = None

        # Iterate over each node in the connected component until we find a node with degree=1
        for node in cc:
            if cover_graph.degree[node] == 1:
                current_node = node
                break

        assert current_node is not None

        # Starting with the degree=1 node, build the AGP object from nodes in the path.
        visited_nodes = {current_node}
        degree = 0
        obj_id = 1
        obj_pos = 0

        # Traverse the component until we find the other end node
        while degree != 1:
            conn_nodes = set(cover_graph.neighbors(current_node))
            next_node = (conn_nodes - visited_nodes).pop()
            degree = cover_graph.degree[next_node]
            comp_len = scaffold_graph.get_component_len(next_node[:-2])

            # Check if this is an intra or inter sequence edge
            orientation = "+"
            if next_node[:-2] == current_node[:-2]:
                # Both nodes belong to the same sequence. Walking from the
                # "_e" node to the "_b" node gives minus orientation.
                if next_node.endswith("_b"):
                    orientation = "-"
                    assert current_node.endswith("_e")

                agp.add_seq_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + comp_len),
                    str(obj_id),
                    "W",
                    next_node[:-2],
                    "1",
                    str(comp_len),
                    orientation
                )
                obj_pos += comp_len
                placed_components.add(next_node[:-2])
            else:
                # Organize the gap info
                adjacency_data = scaffold_graph[current_node][next_node]

                # AGP Column 5
                all_is_known_gap_size = adjacency_data["is_known_gap_size"]
                comp_type = "N" if any(all_is_known_gap_size) else "U"

                # AGP column 6b: 100 unless at least one gap size is known
                gap_size = 100
                all_gap_sizes = adjacency_data["gap_size"]
                fltrd_gap_sizes = [
                    size for idx, size in enumerate(all_gap_sizes)
                    if all_is_known_gap_size[idx]
                ]
                if fltrd_gap_sizes:
                    if len(fltrd_gap_sizes) == 1:
                        gap_size = fltrd_gap_sizes[0]
                    else:
                        gap_size = get_gap_size(fltrd_gap_sizes, gap_func)

                # AGP column 7b: "scaffold" unless all gap types agree
                all_gap_types = set(adjacency_data["gap_type"])
                gap_type = "scaffold"
                if len(all_gap_types) == 1:
                    gap_type = all_gap_types.pop()

                # AGP column 8b
                has_linkage = "yes" if any(adjacency_data["linkage"]) else "no"

                # AGP column 9b
                all_evidences = set(adjacency_data["linkage_evidence"])
                linkage_evidence = "na"
                if has_linkage == "yes":
                    all_evidences.discard("na")
                    # Sorted so the joined string does not depend on set
                    # iteration order, which can differ between runs.
                    linkage_evidence = ";".join(sorted(str(e) for e in all_evidences))

                agp.add_gap_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + gap_size),
                    str(obj_id),
                    comp_type,
                    str(gap_size),
                    gap_type,
                    has_linkage,
                    linkage_evidence
                )
                obj_pos += gap_size

            obj_id += 1
            visited_nodes.add(next_node)
            current_node = next_node

    # Write all unplaced contigs, sorted so the output order is reproducible
    remaining_components = scaffold_graph.components - placed_components
    for c in sorted(remaining_components):
        c_len = str(scaffold_graph.get_component_len(c))
        agp.add_seq_line(
            # Multiplying the string by the boolean yields either the suffix
            # or an empty string.
            c + "_RagTag" * add_suffix_to_unplaced,
            "1",
            c_len,
            "1",
            "W",
            c,
            "1",
            c_len,
            "+"
        )

    agp.write()
Ejemplo n.º 5
0
def main():
    """Split an assembly at gaps and print the resulting sequences.

    Command-line entry point. Each sequence in the assembly FASTA file is
    scanned for runs of "N" (case-insensitive) that are longer than the
    ``-n`` threshold. An AGP file describing how each original sequence is
    composed of the new "seqXXXXXXXX" pieces and the gaps between them is
    written to the ``-o`` path. The AGP file is then read back and every
    non-gap component is printed to stdout in FASTA format.
    """
    parser = argparse.ArgumentParser(description='Split sequencs at gaps',
                                     usage="ragtag.py splitasm <asm.fa>")
    parser.add_argument("asm",
                        metavar="<asm.fa>",
                        default="",
                        type=str,
                        help="assembly fasta file (uncompressed or bgzipped)")
    parser.add_argument("-n",
                        metavar="INT",
                        type=int,
                        default=0,
                        help="minimum gap size [0]")
    parser.add_argument("-o",
                        metavar="PATH",
                        type=str,
                        default="ragtag.splitasm.agp",
                        help="output AGP file path [./ragtag.splitasm.agp]")

    # Parse the command line arguments
    args = parser.parse_args()
    if not args.asm:
        parser.print_help()
        print("\n** The assembly FASTA file is required **")
        sys.exit()

    asm_fn = args.asm
    min_gap_size = args.n
    agp_fn = args.o

    # Initialize the AGP file
    agp = AGPFile(agp_fn, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Process the FASTA file
    new_header_idx = 0
    fai = pysam.FastaFile(asm_fn)
    try:
        for header in sorted(fai.references):
            seq = fai.fetch(header).upper()
            seq_len = fai.get_reference_length(header)
            # Half-open (start, end) coordinates of every qualifying gap
            gap_coords = [(m.start(), m.end()) for m in re.finditer(r'N+', seq)
                          if m.end() - m.start() > min_gap_size]

            if not gap_coords:
                # No gaps: the whole sequence is a single component
                new_header = "seq{0:08}".format(new_header_idx)
                new_header_idx += 1
                agp.add_seq_line(header, "1", seq_len, "1", "W", new_header,
                                 "1", seq_len, "+")
                continue

            # Sentinel "gap" just past the end of the sequence, so that the
            # piece after the last real gap is handled like any other piece.
            gap_coords.append((seq_len, seq_len + 1))
            pid = 1
            first_gap_start = gap_coords[0][0]
            if first_gap_start:
                # The sequence doesn't start with a gap
                new_header = "seq{0:08}".format(new_header_idx)
                agp.add_seq_line(header, "1", str(first_gap_start),
                                 str(pid), "W", new_header, "1",
                                 str(first_gap_start), "+")
                new_header_idx += 1
                pid += 1

            # Each gap is paired with the gap (or sentinel) that follows it
            for (gap_start, gap_end), (next_gap_start, _) in zip(gap_coords, gap_coords[1:]):
                # Add the gap line
                gap_len = gap_end - gap_start
                agp.add_gap_line(header, str(gap_start + 1), str(gap_end),
                                 str(pid), "N", str(gap_len), "scaffold",
                                 "yes", "align_genus")
                pid += 1

                # Add the sequence line, unless the gap reaches the end of
                # the sequence and nothing follows it
                if gap_end != seq_len:
                    comp_len = next_gap_start - gap_end
                    new_header = "seq{0:08}".format(new_header_idx)
                    agp.add_seq_line(header, str(gap_end + 1), next_gap_start,
                                     pid, "W", new_header, "1", str(comp_len),
                                     "+")
                    new_header_idx += 1
                    pid += 1

        # Write once, after every sequence has been added. This also
        # produces the file when the assembly contains no sequences.
        agp.write()

        # Iterate over the AGP file and print the sequences
        agp = AGPFile(agp_fn, mode="r")
        for line in agp.iterate_lines():
            if not line.is_gap:
                obj, comp, obj_beg, obj_end = line.obj, line.comp, line.obj_beg, line.obj_end
                print(">" + comp)
                print(fai.fetch(obj, obj_beg - 1, obj_end))
    finally:
        # Release the FASTA handle even if processing fails part-way
        fai.close()