def main():
    """Print every AGP component as a FASTA record cut out of its object.

    Reads the AGP file and the query FASTA named on the command line. For
    each AGP line, the object interval [obj_beg, obj_end] (1-based,
    inclusive) is fetched from the query FASTA and written to stdout under
    the component name.

    Raises:
        ValueError: if the AGP file contains a gap line or a component
            with '-' orientation.
    """
    parser = argparse.ArgumentParser(
        description="Break corrected query sequences (objects) into components.")
    parser.add_argument("agp",
                        metavar="<ragtag.correction.agp>",
                        type=str,
                        help="AGP v2.1 file produced by 'ragtag.py correct'")
    parser.add_argument(
        "query",
        metavar="<query.fasta>",
        type=str,
        help="query fasta file to be scaffolded. must not be gzipped")

    args = parser.parse_args()
    agp_file = args.agp
    query_file = args.query

    fai = pysam.FastaFile(query_file)
    try:
        agp = AGPFile(agp_file)

        # Iterate through the agp file
        for line in agp.iterate_lines():
            if line.is_gap:
                raise ValueError("The AGP file should have no gaps.")
            if line.orientation == "-":
                raise ValueError("No sequences should have a '-' orientation.")

            # AGP coordinates are 1-based inclusive; fetch() wants 0-based half-open
            start, end = int(line.obj_beg) - 1, int(line.obj_end)
            print(">" + line.comp)
            print(fai.fetch(line.obj, start, end))
    finally:
        # Release the FASTA handle even when validation fails part-way through
        fai.close()
Beispiel #2
0
def main():
    """Rename FASTA records and record the old/new name mapping in an AGP file.

    Each record of the input FASTA is written to stdout under the new name
    PREFIX + zero-padded 8-digit index (in the order pysam reports the
    references). One full-length, '+' oriented AGP sequence line per record
    maps the new name (object) to the original name (component).
    """
    parser = argparse.ArgumentParser(description="Rename FASTA records.", usage="ragtag_rename.py <seqs.fa> [-p PREFIX]")
    parser.add_argument("fasta_fn", metavar="<seqs.fa>", default="", type=str, help="FASTA file (uncompressed or bgzipped)")
    parser.add_argument("-p", metavar="STR", type=str, default="", help="prefix")
    parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.rename.agp", help="output AGP file path [./ragtag.rename.agp]")

    args = parser.parse_args()

    agp = AGPFile(args.o, "w")
    fai = pysam.FastaFile(args.fasta_fn)

    for record_idx, old_name in enumerate(fai.references):
        new_name = args.p + "{0:08}".format(record_idx)
        seq_len = fai.get_reference_length(old_name)

        # The renamed object consists of exactly one whole component
        agp.add_seq_line(new_name, 1, seq_len, "1", "W", old_name, 1, seq_len, "+")

        print(">" + new_name)
        print(fai.fetch(old_name))

    agp.write()
    fai.close()
Beispiel #3
0
def main():
    """Break corrected query sequences (objects) into their components.

    For each line of the AGP file, the object interval [obj_beg, obj_end]
    (1-based, inclusive) is fetched from the query FASTA and printed to
    stdout as a FASTA record named after the component.

    Raises:
        ValueError: if the AGP file contains a gap line or a component
            with '-' orientation.
    """
    parser = argparse.ArgumentParser(
        description="Break corrected query sequences (objects) into components."
    )
    parser.add_argument("agp",
                        metavar="<ragtag.correction.agp>",
                        type=str,
                        help="AGP v2.1 file produced by 'ragtag.py correct'")
    parser.add_argument(
        "query",
        metavar="<query.fasta>",
        type=str,
        help=
        "query fasta file corresponding to objects in <ragtag.correction.agp> (can be uncompressed or bgzipped)"
    )

    args = parser.parse_args()
    agp_file = args.agp
    query_file = args.query

    fai = pysam.FastaFile(query_file)
    try:
        agp = AGPFile(agp_file, mode="r")

        # Iterate through the agp file
        for line in agp.iterate_lines():
            if line.is_gap:
                raise ValueError("The AGP file should have no gaps.")
            if line.orientation == "-":
                raise ValueError("No sequences should have a '-' orientation.")

            # AGP coordinates are 1-based inclusive; fetch() wants 0-based half-open
            start, end = int(line.obj_beg) - 1, int(line.obj_end)
            print(">" + line.comp)
            print(fai.fetch(line.obj, start, end))
    finally:
        # Close the FASTA handle on the error paths as well as on success
        fai.close()
Beispiel #4
0
def main():
    """Write the objects described by an AGP file to stdout in FASTA format.

    Each object becomes one FASTA record whose sequence is written on a
    single line: gap lines contribute runs of "N", sequence lines contribute
    the referenced component interval (reverse complemented for '-'
    orientation). Prints the help text and exits if either positional
    argument is missing.
    """
    parser = argparse.ArgumentParser(
        description="Build sequences in FASTA format from an AGP v2.1 file.",
        usage="ragtag.py agp2fa <scaffolds.agp> <components.fasta>")
    parser.add_argument("agp",
                        metavar="<scaffolds.agp>",
                        nargs='?',
                        default="",
                        type=str,
                        help="AGP v2.1 file")
    parser.add_argument(
        "components",
        metavar="<components.fasta>",
        nargs='?',
        default="",
        type=str,
        help="component FASTA file (can be uncompressed or bgzipped)")

    args = parser.parse_args()
    if not (args.agp and args.components):
        parser.print_help()
        sys.exit()

    fai = pysam.FastaFile(args.components)
    agp = AGPFile(args.agp, mode="r")

    current_obj = None
    header_written = False
    for rec in agp.iterate_lines():
        if rec.obj != current_obj:
            # Sequences are written without line breaks, so every header after
            # the first must begin by terminating the previous sequence line.
            header = ">" + rec.obj
            if header_written:
                header = "\n" + header
            print(header)
            header_written = True
            current_obj = rec.obj

        if rec.is_gap:
            sys.stdout.write("N" * rec.gap_len)
            continue

        # AGP coordinates are 1-based inclusive; fetch() wants 0-based half-open
        seq = fai.fetch(rec.comp, rec.comp_beg - 1, rec.comp_end)
        if rec.orientation == "-":
            seq = reverse_complement(seq)
        sys.stdout.write(seq)

    # Terminate the final sequence line
    sys.stdout.write("\n")
    fai.close()
Beispiel #5
0
def main():
    """Summarize a RagTag scaffolding run as one tab-separated table.

    Components named in the confidence scores file count as "placed"; every
    other component in the AGP file counts as "unplaced". Gap lines are
    tallied separately. A header row and one row of counts are printed to
    stdout. Prints the help text and exits if either positional argument is
    missing.
    """
    parser = argparse.ArgumentParser(description="Calculate scaffolding statistics")
    parser.add_argument("agp", nargs='?', default="", metavar="<ragtag.scaffolds.agp>", type=str, help="RagTag scaffolding AGP file")
    parser.add_argument("confidence", nargs='?', default="", metavar="<ragtag.confidence.txt>", type=str, help="RagTag scaffolding confidence scores file")

    args = parser.parse_args()

    if not args.agp or not args.confidence:
        parser.print_help()
        sys.exit()

    agp_file = args.agp
    confidence_file = args.confidence

    placed_bp = 0
    placed_seq = 0
    unplaced_bp = 0
    unplaced_seq = 0
    gap_bp = 0
    gap_seq = 0

    # Get the set of placed sequences from the confidence scores file
    placed_seqs = set()
    with open(confidence_file, "r") as f:
        f.readline()  # discard header
        for line in f:
            # A blank line carries no record; skipping it keeps the strict
            # four-column unpack below from failing on it.
            if not line.strip():
                continue
            header, _g_score, _l_score, _o_score = line.rstrip().split("\t")
            placed_seqs.add(header)

    # Iterate through the AGP file
    agp = AGPFile(agp_file, mode="r")
    for line in agp.iterate_lines():
        if line.is_gap:
            gap_bp += line.gap_len
            gap_seq += 1
        else:
            # Component coordinates are 1-based inclusive
            seq_len = line.comp_end - (line.comp_beg - 1)
            if line.comp in placed_seqs:
                placed_bp += seq_len
                placed_seq += 1
            else:
                unplaced_bp += seq_len
                unplaced_seq += 1

    print("placed_sequences\tplaced_bp\tunplaced_sequences\tunplaced_bp\tgap_bp\tgap_sequences")
    print("\t".join([
        str(placed_seq),
        str(placed_bp),
        str(unplaced_seq),
        str(unplaced_bp),
        str(gap_bp),
        str(gap_seq)
    ]))
Beispiel #6
0
def sub_update(gff_file, agp_file):
    """Lift GFF features from query objects onto the components they were split into.

    The AGP file maps intervals of each object (original sequence) to
    components. Every GFF feature is rewritten so that column 1 is the
    overlapping component and columns 4/5 are relative to that component's
    start. Comment lines (starting with "#") are echoed unchanged. All
    output goes to stdout.

    :param gff_file: path to the GFF file to update
    :param agp_file: path to the correction AGP file
    :raises ValueError: if the AGP file contains gaps, '-' orientations or
        non-"W" components, if a GFF sequence is absent from the AGP file,
        or if a feature overlaps zero or more than one component
    """
    # Make a dictionary associating each original sequence with an interval tree of component sequences
    trans = defaultdict(IntervalTree)
    agp = AGPFile(agp_file, mode="r")
    for agp_line in agp.iterate_lines():

        # Check that the agp file looks correct for this task.
        # Rule out gap lines first: orientation and component type describe
        # sequence lines, so they are only consulted once this is known to be one.
        if agp_line.is_gap:
            raise ValueError(
                "There should be no gaps in the correction AGP file.")
        if agp_line.orientation == "-":
            raise ValueError(
                "The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction."
            )
        if not agp_line.comp_type == "W":
            raise ValueError(
                "The placement BED file is not formatted correctly. All lines should be WGS contig (W)."
            )

        # Store object intervals zero-based, half-open
        start, end = agp_line.obj_beg - 1, agp_line.obj_end
        trans[agp_line.obj][start:end] = agp_line.comp

    # Iterate through the gff intervals and update them according to trans
    with open(gff_file, "r") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith("#"):
                print(line)  # Print this comment line
            else:
                fields = line.split("\t")
                h, s, e = fields[0], int(fields[3]), int(fields[4])
                s -= 1  # Keep everything zero-indexed

                # Membership test does not insert a key into the defaultdict
                if h not in trans:
                    raise ValueError("Inconsistent input files.")

                ovlps = trans[h][s:e]
                if len(ovlps) > 1:
                    raise ValueError(
                        "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'"
                        % (h, s, e))
                if len(ovlps) < 1:
                    raise ValueError(
                        "The placement BED file is not formatted correctly.")

                # Get the data from the overlapping interval and print the new line
                o = list(ovlps)[0]
                new_s = s - o.begin
                new_e = e - o.begin
                fields[0] = o.data
                fields[3] = str(new_s +
                                1)  # back to one-based indexing for gff format
                fields[4] = str(new_e)
                print("\t".join(fields))
Beispiel #7
0
def write_breaks(out_file, query_file, ctg_breaks, overwrite, remove_suffix):
    """ Write the intermediate file for contig breaks in AGP v2.1 format.

    Every sequence of the query FASTA becomes one AGP object. A sequence
    listed in ctg_breaks is described as consecutive pieces ending at each
    (sorted) break position and finally at the sequence end; each piece is a
    component named "<query>:<start>-<end>(+)". A sequence without breaks is
    a single component, named with the same suffix scheme unless
    remove_suffix is set.

    :param out_file: AGP file to write
    :param query_file: query FASTA file
    :param ctg_breaks: mapping of query name -> iterable of break positions
    :param overwrite: replace out_file if it already exists
    :param remove_suffix: keep the plain query name for unbroken sequences
    """
    if os.path.isfile(out_file):
        if overwrite:
            log("Overwriting pre-existing file: " + out_file)
        else:
            log("Retaining pre-existing file: " + out_file)
            return

    fai = pysam.FastaFile(query_file)
    query_names = sorted(fai.references)
    agp = AGPFile(out_file, mode="w")

    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    for q in query_names:
        q_len = fai.get_reference_length(q)

        if q in ctg_breaks:
            # Treat the sequence end as the last boundary so that every piece,
            # including the trailing one, is emitted by the same code path.
            piece_start = 0
            boundaries = sorted(ctg_breaks[q]) + [q_len]
            for pid, piece_end in enumerate(boundaries, start=1):
                comp_header = q + ":" + str(piece_start) + "-" + str(piece_end) + "(+)"
                agp.add_seq_line(q, str(piece_start + 1), str(piece_end),
                                 str(pid), "W", comp_header, "1",
                                 str(piece_end - piece_start), "+")
                piece_start = piece_end
            continue

        # Unbroken sequence: a single full-length component
        if remove_suffix:
            comp_header = q
        else:
            comp_header = q + ":0" + "-" + str(q_len) + "(+)"
        agp.add_seq_line(q, "1", str(q_len), "1", "W", comp_header, "1",
                         str(q_len), "+")

    log("Writing: " + out_file)
    agp.write()
    fai.close()
Beispiel #8
0
def sup_update(gff_file, agp_file):
    """Lift GFF features from components onto the objects that contain them.

    For every non-gap AGP line the component's object, object offset,
    orientation and length are recorded. Each GFF feature is then shifted
    by its component's offset; features on '-' oriented components are
    mirrored within the component and have their strand flipped. Comment
    lines (starting with "#") are echoed unchanged. Output goes to stdout.

    :param gff_file: path to the GFF file to update
    :param agp_file: path to the scaffolding AGP file
    :raises ValueError: if a GFF sequence is not a component in the AGP file
    """
    # component -> (zero-based object start, object name, orientation, length)
    placements = {}
    for agp_line in AGPFile(agp_file, mode="r").iterate_lines():
        if agp_line.is_gap:
            continue
        obj_start = agp_line.obj_beg - 1
        placements[agp_line.comp] = (
            obj_start,
            agp_line.obj,
            agp_line.orientation,
            agp_line.obj_end - obj_start,
        )

    with open(gff_file, "r") as gff:
        for raw_line in gff:
            line = raw_line.rstrip()
            if line.startswith("#"):
                print(line)  # Comment lines pass through untouched
                continue

            fields = line.split("\t")
            h = fields[0]
            s = int(fields[3]) - 1  # work zero-based, half-open
            e = int(fields[4])
            st = fields[6]

            if h not in placements:
                print()
                print(line)
                raise ValueError("Inconsistent input files.")

            offset, obj_name, comp_strand, comp_len = placements[h]

            # A reverse complemented component mirrors the interval and flips the strand
            if comp_strand == "-":
                s, e = comp_len - e, comp_len - s
                st = "-" if st == "+" else "+"

            fields[0] = obj_name
            fields[3] = str(offset + s + 1)  # back to one-based for gff
            fields[4] = str(offset + e)
            fields[6] = st
            print("\t".join(fields))
Beispiel #9
0
def main():
    """Write the objects described by an AGP file to stdout in FASTA format.

    Each object becomes one FASTA record whose sequence is written on a
    single line: gap lines contribute runs of "N", sequence lines contribute
    the component interval [comp_beg, comp_end] named on the AGP line
    (reverse complemented for '-' orientation).
    """
    parser = argparse.ArgumentParser(
        description="Build sequences in FASTA format from an AGP v2.1 file. ")
    parser.add_argument("agp",
                        metavar="<scaffolds.agp>",
                        type=str,
                        help="AGP v2.1 file")
    parser.add_argument(
        "components",
        metavar="<components.fasta>",
        type=str,
        help=
        "FASTA file with component sequences to be scaffolded. must not be gzipped"
    )

    args = parser.parse_args()
    agp_file = args.agp
    components_file = args.components

    fai = pysam.FastaFile(components_file)
    try:
        agp = AGPFile(agp_file)

        # Iterate over the lines of the AGP file
        prev_obj = None
        is_first = True
        for agp_line in agp.iterate_lines():
            if agp_line.obj != prev_obj:
                if is_first:
                    print(">" + agp_line.obj)
                    is_first = False
                else:
                    # Terminate the previous (single-line) sequence first
                    print("\n>" + agp_line.obj)

                prev_obj = agp_line.obj

            if agp_line.is_gap:
                sys.stdout.write("N" * agp_line.gap_len)
            else:
                # Only the interval named on the AGP line belongs to the object,
                # not necessarily the whole component. AGP coordinates are
                # 1-based inclusive; fetch() wants 0-based half-open.
                seq = fai.fetch(agp_line.comp,
                                int(agp_line.comp_beg) - 1,
                                int(agp_line.comp_end))
                if agp_line.orientation == "-":
                    seq = reverse_complement(seq)
                sys.stdout.write(seq)

        # End the FASTA file with a newline
        sys.stdout.write("\n")
    finally:
        fai.close()
Beispiel #10
0
def main():
    """Validate one or more AGP files by reading each of them completely.

    Prints a disclaimer, then for every file given on the command line
    logs a start message, iterates over all of its AGP lines (discarding
    them) and logs a completion message.
    """
    parser = argparse.ArgumentParser(description="Check AGP v2.1 files for validity.", usage="ragtag.py agpcheck <asm1.agp> [<asm2.agp> ... <asmN.agp>]")
    parser.add_argument("agp", metavar="<asm1.agp> [<asm2.agp> ... <asmN.agp>]", nargs='+', default=[], type=str, help="AGP v2.1 files")

    disclaimer = """
    DISCLAIMER:
    This utility performs most (but not all) checks necessary to validate an
    AGP v2.1 file: https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/
    
    Please additionally use the NCBI AGP validator for robust
    validation: https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Validation/
    """

    args = parser.parse_args()
    print(disclaimer)

    # Resolve every path up front so the log messages show absolute locations
    absolute_paths = []
    for given_path in args.agp:
        absolute_paths.append(os.path.abspath(given_path))

    for agp_path in absolute_paths:
        print()
        log("INFO", "Checking {} ...".format(agp_path))

        # Reading every line is the check; the lines themselves are not needed
        reader = AGPFile(agp_path, mode="r")
        for _unused_line in reader.iterate_lines():
            continue

        log("INFO", "Check for {} is complete with no errors.".format(agp_path))
Beispiel #11
0
    def write_agp(self, agp_fn, ref_fn, add_suffix_to_unplaced=False):
        """
        Write the AGP file implied by the scaffold graph

        Repeatedly picks an unused edge whose source sequence has a combined
        end-node degree of 2, then follows edges from sequence to sequence,
        emitting one AGP object ("scf" + 8-digit index, starting at 0) per
        walk. Sequences listed in the reference FASTA that were never emitted
        are appended afterwards as single-component objects.

        :param agp_fn: AGP file name
        :param ref_fn: reference FASTA file name
        :param add_suffix_to_unplaced: add "_RagTag" to unscaffolded sequences
        """
        # Component names already written, and edges already walked (both directions)
        used_components = set()
        used_edges = set()
        # Incremented before use, so the first object gets index 0
        obj_header_idx = -1

        agp = AGPFile(agp_fn, "w")
        agp.add_pragma()
        agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

        while True:
            # Find a starting node
            from_node = None
            to_node = None
            cur_ref = None
            # Sorted so that the choice of starting edge is deterministic
            for u, v in sorted(self.edges):
                if (u, v) not in used_edges:
                    # Node names are "<sequence>_b" / "<sequence>_e"; drop the 2-char suffix
                    u_base = u[:-2]

                    # Combined degree of whichever end nodes of this sequence exist
                    u_degree = 0
                    if u_base + "_b" in self.nodes:
                        u_degree += self.graph.degree[u_base + "_b"]
                    if u_base + "_e" in self.nodes:
                        u_degree += self.graph.degree[u_base + "_e"]

                    # NOTE(review): assert is skipped under "python -O"
                    assert u_degree in {2, 4}

                    # Check if we have found a starting target sequence
                    if u_degree == 2:
                        cur_ref = u_base
                        from_node = u
                        to_node = v
                        used_edges.add((u, v))
                        used_edges.add((v, u))
                        break

            # If we haven't found a new starting target sequence, we are done
            if from_node is None:
                break

            # Initialize this object
            obj_header_idx += 1
            obj_header = "scf" + "{0:08}".format(obj_header_idx)
            obj_pos = 0  # number of object bases written so far
            obj_pid = 1  # AGP part number of the next line

            # Process the first target sequence
            cur_ref_len = self.component_lens[cur_ref]
            cur_ref_strand = "+"
            # Leaving through the "_b" node means the first sequence is written as "-"
            if from_node.endswith("_b"):
                cur_ref_strand = "-"
            agp.add_seq_line(obj_header, obj_pos+1, obj_pos+cur_ref_len, obj_pid, "W", cur_ref, 1, cur_ref_len, cur_ref_strand)
            obj_pos += cur_ref_len
            obj_pid += 1
            used_components.add(cur_ref)

            # Process the remaining sequences.
            next_edge_exists = True
            while next_edge_exists:
                # Process the patch
                patch_aln = self.graph[from_node][to_node]["alignment"]
                patch_query = patch_aln.query
                patch_strand = "+"
                # A truthy alignment strand is written as "-"
                if patch_aln.strand:
                    patch_strand = "-"

                # Query bases between the two alignments; zero or negative means
                # there is nothing to insert between the two sequences
                patch_len = patch_aln.their_query_start - patch_aln.my_query_end
                if patch_len > 0:
                    if patch_aln.is_gap:
                        agp.add_gap_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "N", patch_len, "scaffold", "yes", "align_genus")
                    else:
                        agp.add_seq_line(obj_header, obj_pos+1, obj_pos+patch_len, obj_pid, "W", patch_query, patch_aln.my_query_end+1, patch_aln.their_query_start, patch_strand)
                        used_components.add(patch_query)
                    obj_pos += patch_len
                    obj_pid += 1

                # Next, process the reference sequence
                # comp_start is <= 0: a negative patch_len trims that many bases
                # from the start of the next sequence's component interval
                comp_start = min(0, patch_len)
                cur_ref = to_node[:-2]
                cur_ref_len = self.component_lens[cur_ref]
                cur_ref_strand = "+"
                # Entering through the "_e" node means this sequence is written as "-"
                if to_node.endswith("_e"):
                    cur_ref_strand = "-"
                agp.add_seq_line(obj_header, obj_pos+1, obj_pos+(cur_ref_len + comp_start), obj_pid, "W", cur_ref, 1+(-1*comp_start), cur_ref_len, cur_ref_strand)
                obj_pos += cur_ref_len + comp_start
                obj_pid += 1
                used_components.add(cur_ref)

                # Look for the next edge
                # Continue from the opposite end node of the sequence just written
                from_node = to_node[:-2] + "_b"
                if to_node.endswith("_b"):
                    from_node = to_node[:-2] + "_e"

                if from_node in self.graph.nodes:
                    next_nodes = set(self.graph[from_node])
                    # NOTE(review): assert is skipped under "python -O"
                    assert len(next_nodes) == 1
                    to_node = next_nodes.pop()
                    used_edges.add((from_node, to_node))
                    used_edges.add((to_node, from_node))
                else:
                    # The opposite end node is not in the graph: this object is complete
                    next_edge_exists = False

        # Write unplaced reference sequences
        fai = pysam.FastaFile(ref_fn)
        all_ref_seqs = set(fai.references)
        fai.close()
        remaining_components = all_ref_seqs - used_components
        for c in sorted(remaining_components):
            agp.add_seq_line(
                # str * bool: "_RagTag" when the flag is True, "" when it is False
                c + "_RagTag" * add_suffix_to_unplaced,
                "1",
                str(self.component_lens[c]),
                "1",
                "W",
                c,
                "1",
                str(self.component_lens[c]),
                "+"
            )

        agp.write()
Beispiel #12
0
    def _get_assembly_points(self, agp, weight):
        """
        Yield one AssemblyPoint per pair of consecutive components of an AGP object
        :param agp: An AGP file defining sequence adjacencies
        :param weight: The weight to assign to each adjacency
        :raises RuntimeError: for an unknown component, or one whose AGP end
                              coordinate differs from its known length
        :raises ValueError: for consecutive gaps between two components, or if the
                            AGP's component set differs from self.components
        """
        seen_comps = set()
        cur_obj = ""

        # The component on the left-hand side of the next adjacency
        left_comp = ""
        left_strand = ""

        # What is known about the gap (if any) seen since the left component
        n_gaps = 0
        gap_known = None
        gap_size = 0
        gap_type = None
        gap_linkage = ""
        gap_evidence = ""

        for rec in AGPFile(agp).iterate_lines():
            if rec.is_gap:
                # Gaps only matter once a component of this object has been seen
                if rec.obj == cur_obj:
                    n_gaps += 1
                    gap_known = bool(rec.comp_type == "N")
                    gap_size = rec.gap_len
                    gap_type = rec.gap_type
                    gap_linkage = bool(rec.linkage == "yes")
                    gap_evidence = rec.linkage_evidence
                continue

            # Add this component to our master list
            if rec.comp not in self.component_lens:
                raise RuntimeError("{} is in {} but not {}.".format(rec.comp, agp, self.components_fasta_fname))
            seen_comps.add(rec.comp)

            # The component must end exactly at its known length
            agp_comp_end = rec.comp_end
            if agp_comp_end < self.get_component_len(rec.comp):
                raise RuntimeError("only complete components can be added to the graph.")
            if agp_comp_end > self.get_component_len(rec.comp):
                raise RuntimeError("inconsistent component lengths: {} bp in {} and {} bp in {}". format(agp_comp_end, agp, self.get_component_len(rec.comp), self.components_fasta_fname))

            if rec.obj == cur_obj:
                # No gap in between means the two components are bookended
                if not n_gaps:
                    gap_evidence = "bookend"

                if n_gaps > 1:
                    raise ValueError("Consecutive gaps in the AGP file are not currently supported.")

                yield AssemblyPoint(
                    left_comp,
                    left_strand,
                    rec.comp,
                    rec.orientation,
                    weight,
                    agp,
                    gap_known,
                    gap_size,
                    gap_type,
                    gap_linkage,
                    gap_evidence
                )
            else:
                # First component of a new object: nothing to pair it with yet
                cur_obj = rec.obj

            # This component becomes the left side of the next adjacency,
            # and the gap description starts over.
            left_comp = rec.comp
            left_strand = rec.orientation
            n_gaps = 0
            gap_known = None
            gap_size = 0
            gap_type = None
            gap_linkage = ""
            gap_evidence = ""

        if seen_comps != self.components:
            raise ValueError("Input AGPs do not have the same set of components.")
Beispiel #13
0
def write_orderings(out_agp_file, out_confidence_file, query_file,
                    ordering_dict, ctg_dict, gap_dict, gap_type_dict,
                    make_chr0, overwrite, add_suffix):
    """Write the scaffolding AGP file and the confidence scores file.

    For each reference header (in sorted order) an object named
    "<ref>_RagTag" is built from its ordered query sequences, with the gaps
    from gap_dict/gap_type_dict interleaved. Query sequences from the FASTA
    that were not placed are then written either concatenated into
    "Chr0_RagTag" (separated by 100 bp "U" gaps) or as one object each.

    :param out_agp_file: AGP file to write
    :param out_confidence_file: tab-separated confidence scores file to write
    :param query_file: query FASTA file
    :param ordering_dict: reference header -> ordered entries; index 2 of
                          each entry is the query sequence name
    :param ctg_dict: query name -> object with query_len, orientation and
                     grouping/location/orientation confidence attributes
    :param gap_dict: reference header -> gap lengths between consecutive queries
    :param gap_type_dict: reference header -> AGP gap component types
    :param make_chr0: concatenate unplaced sequences into "Chr0_RagTag"
    :param overwrite: replace out_agp_file if it already exists
    :param add_suffix: add "_RagTag" to individually listed unplaced sequences
    """
    # Check if the output file already exists
    if os.path.isfile(out_agp_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_agp_file)
            return
        log("Overwriting pre-existing file: " + out_agp_file)

    # Proceed with writing the intermediate output
    placed_seqs = set()
    all_out_cs_lines = []  # For confidence scores
    agp = AGPFile(out_agp_file, mode="w")

    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Go through the reference sequences in sorted order
    for ref_header in sorted(ordering_dict):
        pid = 1
        pos = 0
        new_ref_header = ref_header + "_RagTag"
        gap_seqs = gap_dict[ref_header]
        gap_types = gap_type_dict[ref_header]

        # Iterate through the query sequences for this reference header
        for i, q_entry in enumerate(ordering_dict[ref_header]):
            q = q_entry[2]
            placed_seqs.add(q)
            ctg = ctg_dict[q]
            qlen = ctg.query_len

            agp.add_seq_line(new_ref_header, str(pos + 1), str(pos + qlen),
                             str(pid), "W", q, "1", str(qlen),
                             ctg.orientation)
            pos += qlen
            pid += 1

            # Save the confidence score info
            all_out_cs_lines.append("\t".join([
                q,
                str(ctg.grouping_confidence),
                str(ctg.location_confidence),
                str(ctg.orientation_confidence),
            ]))

            # A gap follows this sequence whenever one is defined for its index
            if i < len(gap_seqs):
                gap_len = gap_seqs[i]
                agp.add_gap_line(new_ref_header, str(pos + 1),
                                 str(pos + gap_len), str(pid), gap_types[i],
                                 str(gap_len), "scaffold", "yes",
                                 "align_genus")
                pos += gap_len
                pid += 1

    # Write unplaced sequences
    fai = pysam.FastaFile(query_file)
    try:
        unplaced_seqs = sorted(set(fai.references) - placed_seqs)
        if unplaced_seqs:
            if make_chr0:
                pos = 0
                pid = 1
                new_ref_header = "Chr0_RagTag"
                for q in unplaced_seqs:
                    qlen = fai.get_reference_length(q)
                    agp.add_seq_line(new_ref_header, str(pos + 1),
                                     str(pos + qlen), str(pid), "W", q, "1",
                                     str(qlen), "+")
                    pos += qlen
                    pid += 1

                    # Now for the gap, since we are making a chr0
                    agp.add_gap_line(new_ref_header, str(pos + 1),
                                     str(pos + 100), str(pid), "U", "100",
                                     "contig", "no", "na")
                    pos += 100
                    pid += 1

                # Remove the final unnecessary gap
                agp.pop_agp_line()
            else:
                # List the unplaced contigs individually
                for q in unplaced_seqs:
                    qlen = fai.get_reference_length(q)
                    obj_header = q + "_RagTag" if add_suffix else q
                    agp.add_seq_line(obj_header, "1", str(qlen), "1", "W", q,
                                     "1", str(qlen), "+")

        agp.write()
    finally:
        # Release the FASTA handle even if building or writing the AGP fails
        fai.close()

    # Write the confidence scores
    with open(out_confidence_file, "w") as f:
        f.write(
            "query\tgrouping_confidence\tlocation_confidence\torientation_confidence\n"
        )
        # One line per placed sequence; with nothing placed only the header is
        # written, rather than a trailing blank line that is not a valid record.
        f.writelines(cs_line + "\n" for cs_line in all_out_cs_lines)
Beispiel #14
0
def write_agp_solution(cover_graph, scaffold_graph, agp_fname, gap_func="MIN", add_suffix_to_unplaced=False):
    """
    Write a scaffolding solution to an AGP file.

    Here, we work with two graphs: a cover_graph and a scaffold_graph. A cover_graph defines a solution to the
    scaffold_graph, and nodes from the same component are connected for convenience. We use the scaffold_graph
    for any original scaffold_graph info/functionality (component lengths and adjacency data).

    Each connected component of the cover_graph is walked from one degree=1 node until the next degree=1 node is
    reached, and is written as one AGP object named "scf<8-digit index>_RagTag". An edge joining two nodes that
    share the same name (ignoring the two-character end suffix) produces a sequence line; any other edge produces
    a gap line built from the scaffold_graph adjacency data. Components that were not placed in any object are
    written afterwards, in sorted order, as single-component objects.

    :param cover_graph: graph accepted by nx.connected_components; node names are a component name followed by
                        a two-character end suffix ("_b" or "_e")
    :param scaffold_graph: ScaffoldGraphBase instance providing get_component_len(), components and the
                           adjacency data ("is_known_gap_size", "gap_size", "gap_type", "linkage",
                           "linkage_evidence") for each pair of joined nodes
    :param agp_fname: path of the AGP file to write
    :param gap_func: passed to get_gap_size() when an adjacency has more than one known gap size
    :param add_suffix_to_unplaced: if true, append "_RagTag" to the object names of unplaced components
    :raises TypeError: if scaffold_graph is not a ScaffoldGraphBase instance
    """
    if not isinstance(scaffold_graph, ScaffoldGraphBase):
        raise TypeError("scaffold_graph must be an instance of ScaffoldGraph")

    placed_components = set()

    agp = AGPFile(agp_fname, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Iterate through the connected components
    for i, cc in enumerate(nx.connected_components(G=cover_graph)):
        # Sort the nodes for deterministic output
        cc = sorted(cc)
        obj_header = "scf" + "{0:08}".format(i+1) + "_RagTag"

        # Start from the first node (in sorted order) with degree=1
        current_node = next((node for node in cc if cover_graph.degree[node] == 1), None)
        assert current_node is not None, "connected component has no node with degree=1"

        # Starting with the degree=1 node, build the AGP object from nodes in the path.
        visited_nodes = {current_node}
        degree = 0
        obj_id = 1
        obj_pos = 0

        # Traverse the component until we find the other end node
        while degree != 1:
            conn_nodes = set(cover_graph.neighbors(current_node))
            next_node = (conn_nodes - visited_nodes).pop()
            degree = cover_graph.degree[next_node]
            comp_name = next_node[:-2]
            comp_len = scaffold_graph.get_component_len(comp_name)

            # Check if this is an intra or inter sequence edge
            if comp_name == current_node[:-2]:
                # Intra sequence edge: entering at the end node ("_e") and leaving at the
                # beginning node ("_b") means the component is reverse oriented.
                orientation = "+"
                if next_node.endswith("_b"):
                    orientation = "-"
                    assert current_node.endswith("_e")

                agp.add_seq_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + comp_len),
                    str(obj_id),
                    "W",
                    comp_name,
                    "1",
                    str(comp_len),
                    orientation
                )
                obj_pos += comp_len
                placed_components.add(comp_name)
            else:
                # Inter sequence edge: organize the gap info
                adjacency_data = scaffold_graph[current_node][next_node]

                # AGP Column 5
                all_is_known_gap_size = adjacency_data["is_known_gap_size"]
                comp_type = "N" if any(all_is_known_gap_size) else "U"

                # AGP column 6b (100 is used when no gap size is known)
                gap_size = 100
                fltrd_gap_sizes = [
                    size
                    for size, is_known in zip(adjacency_data["gap_size"], all_is_known_gap_size)
                    if is_known
                ]
                if fltrd_gap_sizes:
                    if len(fltrd_gap_sizes) == 1:
                        gap_size = fltrd_gap_sizes[0]
                    else:
                        gap_size = get_gap_size(fltrd_gap_sizes, gap_func)

                # AGP column 7b ("scaffold" unless all gap types agree)
                all_gap_types = set(adjacency_data["gap_type"])
                gap_type = "scaffold"
                if len(all_gap_types) == 1:
                    gap_type = all_gap_types.pop()

                # AGP column 8b
                has_linkage = "yes" if any(adjacency_data["linkage"]) else "no"

                # AGP column 9b
                all_evidences = set(adjacency_data["linkage_evidence"])
                linkage_evidence = "na"
                if has_linkage == "yes":
                    all_evidences.discard("na")
                    # Sort so that the joined evidence does not depend on set iteration order
                    linkage_evidence = ";".join(sorted(str(e) for e in all_evidences))

                agp.add_gap_line(
                    obj_header,
                    str(obj_pos + 1),
                    str(obj_pos + gap_size),
                    str(obj_id),
                    comp_type,
                    str(gap_size),
                    gap_type,
                    has_linkage,
                    linkage_evidence
                )
                obj_pos += gap_size

            obj_id += 1
            visited_nodes.add(next_node)
            current_node = next_node

    # Write all unplaced contigs, sorted so that the output order is deterministic
    remaining_components = scaffold_graph.components - placed_components
    for c in sorted(remaining_components):
        comp_len = str(scaffold_graph.get_component_len(c))
        agp.add_seq_line(
            c + "_RagTag" if add_suffix_to_unplaced else c,
            "1",
            comp_len,
            "1",
            "W",
            c,
            "1",
            comp_len,
            "+"
        )

    agp.write()
Beispiel #15
0
def lens_from_agp(fname):
    """
    Collect the length of every object in an AGP file.

    :param fname: path of the AGP file to read
    :return: list of the obj_len values, in the order iterate_objs() yields the objects
    """
    obj_lens = []
    for agp_obj in AGPFile(fname, mode="r").iterate_objs():
        obj_lens.append(agp_obj.obj_len)
    return obj_lens
Beispiel #16
0
def main():
    """
    Split an assembly at gaps and describe the split with an AGP file.

    For every sequence in the FASTA file (processed in sorted header order), each run of "N" characters
    (matched after upper-casing the sequence) that is strictly longer than the -n value becomes an AGP gap
    line, and each stretch of sequence between such runs becomes an AGP sequence line whose component is
    named "seq<8-digit index>". The AGP file is written to the -o path, then read back, and every non-gap
    component is printed to stdout in FASTA format.
    """
    parser = argparse.ArgumentParser(description='Split sequencs at gaps',
                                     usage="ragtag.py splitasm <asm.fa>")
    parser.add_argument("asm",
                        metavar="<asm.fa>",
                        default="",
                        type=str,
                        help="assembly fasta file (uncompressed or bgzipped)")
    parser.add_argument("-n",
                        metavar="INT",
                        type=int,
                        default=0,
                        help="minimum gap size [0]")
    parser.add_argument("-o",
                        metavar="PATH",
                        type=str,
                        default="ragtag.splitasm.agp",
                        help="output AGP file path [./ragtag.splitasm.agp]")

    # Parse the command line arguments
    args = parser.parse_args()
    if not args.asm:
        parser.print_help()
        print("\n** The assembly FASTA file is required **")
        sys.exit()

    asm_fn = args.asm
    min_gap_size = args.n
    agp_fn = args.o

    # Initialize the AGP file
    agp = AGPFile(agp_fn, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Process the FASTA file
    new_header_idx = 0
    fai = pysam.FastaFile(asm_fn)
    try:
        for header in sorted(fai.references):
            seq = fai.fetch(header).upper()
            seq_len = fai.get_reference_length(header)

            # 0-based, half-open coordinates of every gap strictly longer than the threshold
            gap_coords = [(m.start(), m.end()) for m in re.finditer(r'N+', seq)
                          if m.end() - m.start() > min_gap_size]

            if not gap_coords:
                # No gaps: the whole sequence is a single component
                new_header = "seq{0:08}".format(new_header_idx)
                new_header_idx += 1
                agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1",
                                 seq_len, "+")
                continue

            # Sentinel marking the end of the sequence, so that every real gap has a successor
            gap_coords.append((seq_len, seq_len + 1))
            pid = 1
            first_gap_start = gap_coords[0][0]
            if first_gap_start:
                # The sequence doesn't start with a gap
                new_header = "seq{0:08}".format(new_header_idx)
                agp.add_seq_line(header, "1", str(first_gap_start),
                                 str(pid), "W", new_header, "1",
                                 str(first_gap_start), "+")
                new_header_idx += 1
                pid += 1

            for (gap_start, gap_end), (next_gap_start, _) in zip(gap_coords, gap_coords[1:]):
                # Add the gap line
                gap_len = gap_end - gap_start
                agp.add_gap_line(header, str(gap_start + 1), str(gap_end),
                                 str(pid), "N", str(gap_len), "scaffold",
                                 "yes", "align_genus")
                pid += 1

                # Add the sequence line, unless the gap reaches the end of the sequence
                if gap_end != seq_len:
                    comp_len = next_gap_start - gap_end
                    new_header = "seq{0:08}".format(new_header_idx)
                    agp.add_seq_line(header, str(gap_end + 1), next_gap_start, pid,
                                     "W", new_header, "1", str(comp_len), "+")
                    new_header_idx += 1
                    pid += 1

        # Write the AGP file once, after every sequence has been added. This also covers
        # a FASTA file without any sequences, for which no write would otherwise happen.
        agp.write()

        # Iterate over the AGP file and print the sequences
        agp = AGPFile(agp_fn, mode="r")
        for line in agp.iterate_lines():
            if not line.is_gap:
                obj, comp, obj_beg, obj_end = line.obj, line.comp, line.obj_beg, line.obj_end
                print(">" + comp)
                print(fai.fetch(obj, obj_beg - 1, obj_end))
    finally:
        # Release the FASTA handle even if processing fails part way through
        fai.close()