Exemple #1
0
def _reestimate_distances(graph, scaffolds, contigs_fasta):
    """
    Estimates distances between contigs using overlap graph
    """
    restricted_nodes = set()
    for scf in scaffolds:
        for contig in scf.contigs:
            restricted_nodes.add("+" + contig.name())
            restricted_nodes.add("-" + contig.name())

    for scf in scaffolds:
        for prev_cont, next_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            src, dst = prev_cont.signed_name(), next_cont.signed_name()
            if graph.has_edge(src, dst):
                overlap = graph[src][dst]["label"]
                prev_cont.link.gap = -int(overlap)

            else:
                path = _shortest_path(graph, src, dst, restricted_nodes)
                if not path:
                    continue

                path_len = 0
                for node in path[1:-1]:
                    path_len += len(contigs_fasta[node[1:]])
                for n1, n2 in zip(path[:-1], path[1:]):
                    overlap = graph[n1][n2]["label"]
                    path_len -= int(overlap)

                prev_cont.link.gap = path_len
Exemple #2
0
def update_gaps(scaffolds):
    """
    Shrinks every link gap by the gaps reported by its two flanking contigs

    For each pair of neighbouring contigs the left contig's link gap is
    reduced by the left contig's right_gap() plus the right contig's
    left_gap(). Meant to be run at the very end.
    """
    for scaffold in scaffolds:
        neighbours = zip(scaffold.contigs[:-1], scaffold.contigs[1:])
        for left, right in neighbours:
            left.link.gap = left.link.gap - (left.right_gap() +
                                             right.left_gap())
    def _make_unplaced_fasta(self):
        """
        Creates unplaced (not used in scaffolds) sequences in FASTA format
        """
        used_ranges_by_seq = defaultdict(list)
        for scf in self.scaffolds:
            for ctg in scf.contigs:
                seq_name, seq_start, seq_end = ctg.name_with_coords()
                used_ranges_by_seq[seq_name].append((seq_start, seq_end))
        for seq_name in self.fragments_fasta:
            seq_len = len(self.fragments_fasta[seq_name])
            used_ranges_by_seq[seq_name].append((0, 0))
            used_ranges_by_seq[seq_name].append((seq_len, seq_len))
            used_ranges_by_seq[seq_name].sort()

        unused_ranges_by_seq = defaultdict(list)
        for seq_name in self.fragments_fasta:
            for range_1, range_2 in zip(used_ranges_by_seq[seq_name][:-1],
                                        used_ranges_by_seq[seq_name][1:]):
                if range_1[1] < range_2[0]:
                    unused_ranges_by_seq[seq_name].append((range_1[1],
                                                           range_2[0]))

        unplaced_fasta = {}
        for seq_name, unused_ranges in unused_ranges_by_seq.items():
            for ur in unused_ranges:
                if ur[0] == 0 and ur[1] == len(self.fragments_fasta[seq_name]):
                    fragment_name = seq_name
                else:
                    fragment_name = seq_name + "[{0}:{1}]".format(ur[0], ur[1])
                unplaced_fasta[fragment_name] = \
                                    self.fragments_fasta[seq_name][ur[0]:ur[1]]

        self.unplaced_fasta = unplaced_fasta
Exemple #4
0
    def alternating_cycle(self, node_1, node_2):
        """
        Determines if there is a cycle of alternating colors
        that goes through the given red-supported (!) edge

        Walks the paths produced by self._alternating_paths(node_1, node_2),
        skipping two-node paths. A path qualifies when every second edge
        (2nd, 4th, ...) is supported by exactly the target genome and the
        remaining edges (1st, 3rd, ...) share at least one supporting
        genome. Returns half the node count of the first qualifying path,
        or None if there is none.
        """
        def edge_genomes(edge):
            left, right = edge
            return self.genomes_support(left, right)

        for path in self._alternating_paths(node_1, node_2):
            assert len(path) % 2 == 0
            if len(path) == 2:
                continue

            edges = list(zip(path[:-1], path[1:]))

            even_colors = [edge_genomes(edge) for edge in edges[1::2]]
            if any(set(colors) != {self.target} for colors in even_colors):
                continue

            odd_colors = list(map(edge_genomes, edges[0::2]))
            shared_genomes = set(odd_colors[0]).intersection(*odd_colors)
            if shared_genomes:
                #self._check_distances(path)
                return len(path) // 2

        return None
Exemple #5
0
 def _build_adj_graph(self):
     """
     Builds an undirected graph over contig ends of the old scaffolds

     Edges connect the right end of each contig to the left end of its
     successor, the two ends of every contig, and the right end of the
     last contig to the left end of the first one. The graph is stored
     in self.adj_graph.
     """
     graph = nx.Graph()
     for scaffold in self.old_scaffolds:
         contigs = scaffold.contigs
         for left, right in zip(contigs[:-1], contigs[1:]):
             graph.add_edge(left.right_end(), right.left_end())
         for contig in contigs:
             graph.add_edge(contig.left_end(), contig.right_end())
         #chromosome ends
         last, first = contigs[-1], contigs[0]
         graph.add_edge(last.right_end(), first.left_end())
     self.adj_graph = graph
def output_links(scaffolds, out_links):
    """
    Outputs pretty table with information about adjacencies

    Writes one table per scaffold (sorted by scaffold name) into the file
    out_links. Every row describes a contig: signed name, start offset
    within the scaffold, length, gap to the next contig and the link
    support string. Columns are left-aligned and padded to the widest
    cell plus COL_GAP spaces.
    """
    HEADER = ["sequence", "start", "length", "gap", "support"]
    COL_GAP = 4

    def format_row(cells, widths):
        padded = [cell + " " * (width - len(cell) + COL_GAP)
                  for cell, width in zip(cells, widths)]
        return "".join(padded)

    with open(out_links, "w") as f:
        for scf in sorted(scaffolds, key=lambda s: s.name):
            rows = []
            offset = 0
            for contig in scf.contigs:
                start = offset
                offset = start + contig.length() + contig.link.gap
                support = _support_to_string(contig.link)
                rows.append([contig.signed_name(), str(start),
                             str(contig.length()), str(contig.link.gap),
                             support])

            # Transpose header + rows to measure every column at once.
            col_widths = [max(len(cell) for cell in column)
                          for column in zip(HEADER, *rows)]
            separator = "-" * (sum(col_widths) + COL_GAP * len(col_widths))

            #header
            f.write(separator + "\n")
            f.write(scf.name + "\n")
            f.write(separator + "\n")
            f.write(format_row(HEADER, col_widths))
            f.write("\n" + separator + "\n")

            #values
            for row in rows:
                f.write(format_row(row, col_widths) + "\n")

            f.write(separator + "\n\n")
Exemple #7
0
    def _genome_distance(self, genome_1, genome_2):
        """
        Calculates breakpoint distance between two genomes
        """
        breakpoints_1 = set()
        n_blocks_1 = 0
        for perm in self.perms_by_genome[genome_1]:
            n_blocks_1 += len(perm.blocks)
            for bl_1, bl_2 in zip(perm.blocks[:-1], perm.blocks[1:]):
                bp = sorted([-bl_1.signed_id(), bl_2.signed_id()])
                breakpoints_1.add(tuple(bp))

        breakpoints_2 = set()
        n_blocks_2 = 0
        for perm in self.perms_by_genome[genome_2]:
            n_blocks_2 += len(perm.blocks)
            for bl_1, bl_2 in zip(perm.blocks[:-1], perm.blocks[1:]):
                bp = sorted([-bl_1.signed_id(), bl_2.signed_id()])
                breakpoints_2.add(tuple(bp))

        return (min(len(breakpoints_1), len(breakpoints_2)) -
                len(breakpoints_1 & breakpoints_2))
Exemple #8
0
    def _build_bp_graph(self):
        """
        Builds a two-colored breakpoint multigraph from the old and the
        new scaffolds and stores it in self.bp_graph.

        No repeats assumed!

        Nodes are contig ends. Every edge carries a scf_set attribute
        ("old" or "new") and an infinity flag: adjacency edges between
        consecutive contigs have infinity=False, the edge closing a
        scaffold (last contig's right end to first contig's left end) has
        infinity=True. Contigs of the new scaffolds that do not occur in
        the old scaffolds are not represented as nodes; instead they are
        folded into the link of the preceding retained contig.
        """
        # Names of all contigs present in the old scaffolds.
        old_contigs = set()
        for scf in self.old_scaffolds:
            for cnt in scf.contigs:
                old_contigs.add(cnt.name())

        ###creating 2-colored breakpoint graph
        bp_graph = nx.MultiGraph()
        for scf in self.old_scaffolds:
            for cnt_1, cnt_2 in zip(scf.contigs[:-1], scf.contigs[1:]):
                bp_graph.add_edge(cnt_1.right_end(), cnt_2.left_end(),
                                  scf_set="old", link=copy(cnt_1.link),
                                  scf_name=scf.name, infinity=False)
            #chromosome ends
            bp_graph.add_edge(scf.contigs[-1].right_end(),
                              scf.contigs[0].left_end(), scf_set="old",
                              infinity=True)

        for scf in self.new_scaffolds:
            # Find the first contig that also exists in the old scaffolds;
            # `pos` is left pointing at it after the break.
            prev_cont = None
            first_ctg = None
            pos = 0
            for pos, contig in enumerate(scf.contigs):
                if contig.name() in old_contigs:
                    # Deep copy: the link is modified below and the
                    # scaffold's own contig must stay untouched.
                    prev_cont = deepcopy(contig)
                    first_ctg = prev_cont
                    break
            if prev_cont is None:
                # Nothing in common with the old scaffolds.
                continue

            for next_cont in scf.contigs[pos + 1:]:
                if next_cont.name() not in old_contigs:
                    # Skipped contig: extend the running gap by its length
                    # and its own gap, and keep only the genomes that
                    # support both links.
                    prev_cont.link.gap += next_cont.length() + next_cont.link.gap
                    common_genomes = (set(prev_cont.link.supporting_genomes) &
                                      set(next_cont.link.supporting_genomes))
                    prev_cont.link.supporting_genomes = list(common_genomes)

                else:
                    bp_graph.add_edge(prev_cont.right_end(), next_cont.left_end(),
                                      scf_set="new", link=copy(prev_cont.link),
                                      scf_name=scf.name, infinity=False)
                    prev_cont = deepcopy(next_cont)

            # Closing edge of the new scaffold (no link information).
            bp_graph.add_edge(prev_cont.right_end(),
                              first_ctg.left_end(), scf_set="new",
                              infinity=True, link=None)

        self.bp_graph = bp_graph
Exemple #9
0
def _update_scaffolds(scaffolds, perm_container):
    """
    Updates scaffolds wrt to given permutations

    Each contig is replaced by the target permutations of perm_container
    that have the same (chr_name, repeat_id) and start inside the
    contig's permutation. The replacements are ordered along the contig
    orientation; links between them get the coordinate distance as gap,
    and the last one inherits a copy of the original contig's link.
    Contigs with no matching permutation are dropped (and logged);
    scaffolds that end up empty are omitted from the returned list.
    """
    perms_by_seq = defaultdict(list)
    for perm in perm_container.target_perms:
        perms_by_seq[(perm.chr_name, perm.repeat_id)].append(perm)

    new_scaffolds = []
    for scf in scaffolds:
        rebuilt = []
        for contig in scf.contigs:
            old_perm = contig.perm

            inner_perms = []
            for candidate in perms_by_seq[(old_perm.chr_name,
                                           old_perm.repeat_id)]:
                starts_inside = (old_perm.seq_start <= candidate.seq_start <
                                 old_perm.seq_end)
                if not starts_inside:
                    continue
                inner_perms.append(candidate)
                assert (old_perm.seq_start < candidate.seq_end <=
                        old_perm.seq_end)

            if not inner_perms:
                logger.debug("Lost: %s", str(contig.perm))
                continue

            # Reverse-strand contigs are traversed from the far end.
            inner_perms.sort(key=lambda p: p.seq_start,
                             reverse=contig.sign < 0)
            for prev_perm, next_perm in zip(inner_perms, inner_perms[1:]):
                gap_length = (next_perm.seq_start - prev_perm.seq_end
                              if contig.sign > 0
                              else prev_perm.seq_start - next_perm.seq_end)
                support = [
                    GenChrPair(prev_perm.genome_name, prev_perm.chr_name)
                ]
                link = Link(gap_length, support)
                rebuilt.append(
                    Contig.with_perm(prev_perm, contig.sign, link))

            # The outgoing link of the original contig stays on the tail.
            rebuilt.append(
                Contig.with_perm(inner_perms[-1], contig.sign,
                                 copy(contig.link)))

        if rebuilt:
            new_scaffolds.append(
                Scaffold.with_contigs(scf.name, None, None, rebuilt))
    return new_scaffolds
Exemple #10
0
def _insert_from_graph(graph, scaffolds_in, max_path_len, contigs_fasta):
    """
    Inserts contigs from the assembly graph into scaffolds

    For every pair of neighbouring contigs the nodes returned by
    _get_cut_vertices are inserted between them as new contigs. The link
    of the left contig and the links of all inserted contigs are marked
    as supported by the assembly and get the configured minimal scaffold
    gap; inserted contigs reuse the supporting genomes of the left
    contig's link. Returns a list of new Scaffold objects (the contig
    objects of the input scaffolds are reused, not copied).
    """
    ordered_contigs = set()
    for scf in scaffolds_in:
        ordered_contigs.update(c.name() for c in scf.contigs)
    reverse_graph = graph.reverse()

    new_scaffolds = []
    for scf in scaffolds_in:
        current = Scaffold(scf.name)
        new_scaffolds.append(current)

        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            current.contigs.append(prev_cont)

            #find contigs to insert
            path_nodes = _get_cut_vertices(graph, reverse_graph, prev_cont,
                                           new_cont, max_path_len,
                                           ordered_contigs)
            if path_nodes:
                #insert contigs along the path
                supp_genomes = prev_cont.link.supporting_genomes
                prev_cont.link.supporting_assembly = True
                prev_cont.link.gap = config.vals["min_scaffold_gap"]

                for node in path_nodes:
                    # Node names look like "+name" / "-name".
                    name = node[1:]
                    sign = 1 if node[0] == "+" else -1

                    inserted = Contig.with_sequence(
                        name, len(contigs_fasta[name]), sign)
                    inserted.link.supporting_assembly = True
                    inserted.link.gap = config.vals["min_scaffold_gap"]
                    inserted.link.supporting_genomes = supp_genomes
                    current.contigs.append(inserted)

        current.contigs.append(scf.contigs[-1])

    return new_scaffolds
Exemple #11
0
    def _fix_gaps(self):
        """
        Handles negative gaps, ensures that gap values are
        within deined range
        """
        def get_seq(contig):
            seq_name, seg_start, seg_end = contig.name_with_coords()
            cont_seq = self.fragments_fasta[seq_name][seg_start:seg_end]
            if contig.sign < 0:
                cont_seq = reverse_complement(cont_seq)
            return cont_seq

        def count_ns(cnt_1, cnt_2):
            seq_1, seq_2 = get_seq(cnt_1), get_seq(cnt_2)

            left_ns, right_ns = 0, 0
            for i in range(len(seq_1) - 1, 0, -1):
                if seq_1[i].upper() != "N":
                    break
                left_ns += 1
            for i in range(len(seq_2) - 1):
                if seq_2[i].upper() != "N":
                    break
                right_ns += 1
            return left_ns, right_ns

        for scf in self.scaffolds:
            for cnt_1, cnt_2 in zip(scf.contigs[:-1], scf.contigs[1:]):
                if cnt_1.link.supporting_assembly:
                    cnt_1.trim_right(max(0, -cnt_1.link.gap))
                    cnt_1.link.gap = max(0, cnt_1.link.gap)
                    continue

                left_ns, right_ns = count_ns(cnt_1, cnt_2)

                cnt_1.trim_right(left_ns)
                cnt_2.trim_left(right_ns)
                cnt_1.link.gap += left_ns + right_ns
                cnt_1.link.gap = max(cnt_1.link.gap,
                                     config.vals["min_scaffold_gap"])
Exemple #12
0
def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step

    Inserts contigs that appear only in small_scaffolds between
    neighbouring contigs of big_scaffolds. Contigs are identified by
    their `perm` attribute; a perm occurring more than once in either
    scaffold set is treated as a repeat and is never used as an anchor.

    For every pair of consecutive non-repeat contigs of a big scaffold,
    the contigs lying between the same two anchors in a small scaffold
    are inserted, provided that both anchors are found in the same small
    scaffold, their relative orientation agrees, and none of the contigs
    in between is unique in the big scaffolds. Otherwise the big
    scaffold's own contigs between the anchors are kept.

    Returns the list of resulting scaffolds; statistics are written to
    the debug log.
    """
    # Failure counters, only used for the debug log at the end.
    count_diff_scaf = 0
    count_diff_orient = 0
    count_inconsistent = 0

    total_success = 0
    total_fail = 0
    total_inserted = 0
    not_found = 0

    # Number of occurrences of every perm in each scaffold set.
    big_count = defaultdict(int)
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_count[c.perm] += 1

    small_count = defaultdict(int)
    for scf in small_scaffolds:
        for c in scf.contigs:
            small_count[c.perm] += 1

    # A perm seen more than once in either set is a repeat.
    repeats = set(seq for (
        seq,
        count) in chain(list(big_count.items()), list(small_count.items()))
                  if count > 1)
    big_unique = set(seq for (seq, count) in big_count.items() if count == 1)

    # Non-repeat perm -> (small scaffold, position inside it).
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            if contig.perm not in repeats:
                assert contig.perm not in small_index
                small_index[contig.perm] = (scf, pos)

    new_scafflods = []
    for big_scf in big_scaffolds:
        new_contigs = []
        # Indices of the anchor (non-repeat) contigs of this scaffold.
        non_repeats = [
            i for i in range(len(big_scf.contigs))
            if big_scf.contigs[i].perm not in repeats
        ]
        for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
            left_cnt = big_scf.contigs[left_idx]
            right_cnt = big_scf.contigs[right_idx]

            consistent = False
            weak_contigs = None
            link_to_change = None
            if (left_cnt.perm in small_index
                    and right_cnt.perm in small_index):
                consistent = True
                left_scf, left_pos = small_index[left_cnt.perm]
                right_scf, right_pos = small_index[right_cnt.perm]

                # True when both anchors have the same sign; compared
                # between the two scaffold sets below.
                big_sign = left_cnt.sign == right_cnt.sign
                small_sign = (left_scf.contigs[left_pos].sign ==
                              right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    count_diff_scaf += 1
                    consistent = False
                elif big_sign != small_sign:
                    count_diff_orient += 1
                    consistent = False
                else:
                    # Normalize so that left_pos < right_pos; same_dir
                    # remembers whether the anchor order was preserved.
                    same_dir = left_pos < right_pos
                    if not same_dir:
                        left_pos, right_pos = right_pos, left_pos

                    # Contigs strictly between the anchors in the small
                    # scaffold: candidates for insertion.
                    weak_contigs = left_scf.contigs[left_pos + 1:right_pos]
                    if any(c.perm in big_unique for c in weak_contigs):
                        count_inconsistent += 1
                        consistent = False

                    # Computed even for the inconsistent case; it is only
                    # applied below when `consistent` is still True.
                    link_to_change = copy(left_scf.contigs[left_pos].link)
                    #reverse complement
                    if weak_contigs and not same_dir:
                        link_to_change = copy(left_scf.contigs[right_pos -
                                                               1].link)
                        weak_contigs = [
                            c.reverse_copy() for c in weak_contigs[::-1]
                        ]
                        # After reversal every contig takes the link of
                        # its new successor; the last one takes the link
                        # of the (swapped) left anchor.
                        for pw, nw in zip(weak_contigs[:-1], weak_contigs[1:]):
                            pw.link = copy(nw.link)
                        weak_contigs[-1].link = copy(
                            left_scf.contigs[left_pos].link)

            else:
                not_found += 1

            new_contigs.append(left_cnt)
            if consistent and weak_contigs:
                # Note: this replaces the link on the contig object that
                # also belongs to big_scf.
                new_contigs[-1].link = link_to_change
                new_contigs.extend(weak_contigs)
                total_success += 1
                total_inserted += len(weak_contigs)
            else:
                # Keep whatever the big scaffold had between the anchors.
                new_contigs.extend(big_scf.contigs[left_idx + 1:right_idx])
                total_fail += 1

        # NOTE(review): contigs located before the first or after the last
        # non-repeat contig of big_scf are never added to new_contigs, so
        # they are absent from a rebuilt scaffold -- confirm this is
        # intended.
        if len(new_contigs) > 1:
            # right_cnt is the right anchor of the last processed pair.
            new_contigs.append(right_cnt)
            s = Scaffold(big_scf.name)
            s.contigs = new_contigs
            new_scafflods.append(s)
        else:  #because of repeats
            new_scafflods.append(big_scf)

    logger.debug("Fail: not found: %d", not_found)
    logger.debug("Fail: different scaffolds: %d", count_diff_scaf)
    logger.debug("Fail: different orientatilns: %d", count_diff_orient)
    logger.debug("Fail: inconsistent: %d", count_inconsistent)
    logger.debug("Total success: %d", total_success)
    logger.debug("Total fail: %d", total_fail)
    logger.debug("Total inserted: %d", total_inserted)

    num_contigs = 0
    for scf in new_scafflods:
        num_contigs += len(scf.contigs)
    logger.debug("Result: %d contigs in %d scaffolds", num_contigs,
                 len(new_scafflods))

    return new_scafflods
Exemple #13
0
 def iter_pairs(self):
     """
     Yields (previous, next) tuples for every two consecutive blocks
     """
     previous_blocks = self.blocks[:-1]
     next_blocks = self.blocks[1:]
     yield from zip(previous_blocks, next_blocks)