コード例 #1
0
ファイル: divergence.py プロジェクト: sebschmi/Flye
def _contig_profile(alignment, platform, genome_len):
    """
    Builds a per-position profile of the target from read alignments.

    Returns a ``(profile, aln_errors)`` pair: ``profile`` is a list of
    ``genome_len`` Profile objects and ``aln_errors`` holds ``err_rate``
    of every alignment, in input order.

    ``platform`` is not used here (the error-rate filter is disabled).
    """
    aln_errors = []
    profile = [Profile() for _ in range(genome_len)]

    for aln in alignment:
        aln_errors.append(aln.err_rate)

        # the query is shifted first, then the target is shifted
        # against the already shifted query
        shifted_qry = shift_gaps(aln.trg_seq, aln.qry_seq)
        shifted_trg = shift_gaps(shifted_qry, aln.trg_seq)

        pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(shifted_trg, shifted_qry):
            is_insertion = trg_nuc == "-"
            if is_insertion:
                # a target gap is recorded at the preceding position
                pos -= 1
            if pos >= genome_len:
                # positions past the end wrap around to the beginning
                pos -= genome_len

            column = profile[pos]
            if is_insertion:
                column.insertions[qry_nuc] += 1
            else:
                column.nucl = trg_nuc
                column.matches[qry_nuc] += 1

            pos += 1

    return profile, aln_errors
コード例 #2
0
def _get_bubble_seqs(alignment, platform, profile, partition, contig_info):
    """
    Cuts the contig into bubbles at the partition points and gathers,
    for every bubble, the read fragments (branches) aligned over it.

    Returns a list of Bubble objects, one per interval between
    consecutive points of ``[0] + partition + [contig_info.length]``,
    or an empty list when ``partition`` is empty.

    ``platform`` is not used here (the error-rate filter is disabled).
    """
    if not partition:
        return []

    ctg_len = contig_info.length
    boundaries = [0] + partition + [ctg_len]

    bubbles = []
    for left, right in zip(boundaries, boundaries[1:]):
        bubble = Bubble(contig_info.id, left)
        bubble.consensus = "".join(p.nucl for p in profile[left:right])
        bubbles.append(bubble)

    for aln in alignment:
        cur_bubble = bisect(partition, aln.trg_start % ctg_len)
        next_start = boundaries[cur_bubble + 1]

        # on non-circular contigs the partial segments at both contig
        # ends are kept as branches
        keep_head = cur_bubble == 0 and contig_info.type != "circular"
        keep_tail = (aln.trg_end > partition[-1]
                     and contig_info.type != "circular")

        seg_begin = None
        seen_boundary = False
        pos = aln.trg_start
        for idx, trg_nuc in enumerate(aln.trg_seq):
            if trg_nuc == "-":
                continue
            if pos >= ctg_len:
                pos -= ctg_len

            crossed = pos >= next_start or pos == 0
            if crossed:
                # the segment before the first boundary is only kept
                # when the alignment starts in the first bubble of a
                # non-circular contig
                if seen_boundary or keep_head:
                    fragment = aln.qry_seq[seg_begin:idx].replace("-", "")
                    bubbles[cur_bubble].branches.append(fp.to_acgt(fragment))

                seen_boundary = True
                cur_bubble = bisect(partition, pos)
                next_start = boundaries[cur_bubble + 1]
                seg_begin = idx

            pos += 1

        if keep_tail:
            fragment = aln.qry_seq[seg_begin:].replace("-", "")
            bubbles[-1].branches.append(fp.to_acgt(fragment))

    return bubbles
コード例 #3
0
ファイル: bubbles.py プロジェクト: sebschmi/Flye
def _get_bubble_seqs(alignment, profile, partition, contig_id):
    """
    Cuts the contig into bubbles at the partition points and gathers,
    for every bubble, the read fragments (branches) aligned over it.

    The contig length is taken from ``trg_len`` of the first alignment.
    Returns a list of Bubble objects, one per interval between
    consecutive points of ``[0] + partition + [contig length]``, or an
    empty list when either ``partition`` or ``alignment`` is empty.
    """
    if not partition or not alignment:
        return []

    ctg_len = alignment[0].trg_len
    boundaries = [0] + partition + [ctg_len]

    bubbles = []
    for left, right in zip(boundaries, boundaries[1:]):
        bubble = Bubble(contig_id, left)
        bubble.consensus = "".join(p.nucl for p in profile[left:right])
        bubbles.append(bubble)

    for aln in alignment:
        cur_bubble = bisect(partition, aln.trg_start)
        next_start = boundaries[cur_bubble + 1]

        # partial segments at both contig ends are kept as branches
        keep_head = cur_bubble == 0
        keep_tail = aln.trg_end > partition[-1]

        seg_begin = None
        seen_boundary = False
        pos = aln.trg_start
        for idx, trg_nuc in enumerate(aln.trg_seq):
            if trg_nuc == "-":
                continue

            # target positions are not wrapped around the contig end here
            crossed = pos >= next_start or pos == 0
            if crossed:
                # the segment before the first boundary is only kept
                # when the alignment starts in the first bubble
                if seen_boundary or keep_head:
                    fragment = aln.qry_seq[seg_begin:idx].replace("-", "")
                    bubbles[cur_bubble].branches.append(fp.to_acgt(fragment))

                seen_boundary = True
                cur_bubble = bisect(partition, pos)
                next_start = boundaries[cur_bubble + 1]
                seg_begin = idx

            pos += 1

        if keep_tail:
            fragment = aln.qry_seq[seg_begin:].replace("-", "")
            bubbles[-1].branches.append(fp.to_acgt(fragment))

    return bubbles
コード例 #4
0
ファイル: consensus.py プロジェクト: fenderglass/Flye
def _contig_profile(alignment, platform):
    """
    Computes alignment profile

    Returns a ``(profile, aln_errors)`` pair. ``profile`` holds one
    Profile per target position (the length is taken from ``trg_len`` of
    the first alignment) and ``aln_errors`` holds ``err_rate`` of every
    alignment, in input order. For an empty ``alignment`` both lists are
    empty, so the result can always be unpacked into two values.

    ``platform`` is not used here (the error-rate filter is disabled).
    """
    if not alignment:
        # keep the same (profile, aln_errors) shape as the regular path
        return [], []

    genome_len = alignment[0].trg_len

    aln_errors = []
    profile = [Profile() for _ in range(genome_len)]
    for aln in alignment:
        aln_errors.append(aln.err_rate)

        #after gap shifting it is possible that
        #two gaps are aligned against each other
        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            if trg_nuc == "-":
                # a target gap is recorded at the preceding position
                trg_pos -= 1
            if trg_pos >= genome_len:
                # positions past the end wrap around to the beginning
                trg_pos -= genome_len

            prof_elem = profile[trg_pos]
            if trg_nuc == "-" and qry_nuc != "-":
                # inserted bases are accumulated per read id
                prof_elem.insertions[aln.qry_id] += qry_nuc
            else:
                prof_elem.nucl = trg_nuc
                prof_elem.matches[qry_nuc] += 1

            trg_pos += 1

    return profile, aln_errors
コード例 #5
0
def _compute_profile(alignment, platform, genome_len):
    """
    Builds per-position alignment statistics for the target.

    Alignments whose ``err_rate`` exceeds the platform's
    ``max_aln_error`` setting, or whose ``qry_seq`` is shorter than the
    ``min_polish_aln_len`` setting, are skipped.

    Returns a ``(profile, aln_errors)`` pair: ``profile`` is a list of
    ``genome_len`` ProfileInfo objects and ``aln_errors`` holds
    ``err_rate`` of every alignment that was used.
    """
    max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
    min_aln_len = cfg.vals["min_polish_aln_len"]

    aln_errors = []
    profile = [ProfileInfo() for _ in range(genome_len)]

    for aln in alignment:
        rejected = (aln.err_rate > max_aln_err
                    or len(aln.qry_seq) < min_aln_len)
        if rejected:
            continue

        aln_errors.append(aln.err_rate)

        # the query is shifted first, then the target is shifted
        # against the already shifted query
        shifted_qry = shift_gaps(aln.trg_seq, aln.qry_seq)
        shifted_trg = shift_gaps(shifted_qry, aln.trg_seq)

        pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(shifted_trg, shifted_qry):
            is_insertion = trg_nuc == "-"
            if is_insertion:
                # a target gap is counted at the preceding position
                pos -= 1
            if pos >= genome_len:
                # positions past the end wrap around to the beginning
                pos -= genome_len

            column = profile[pos]
            if is_insertion:
                column.num_inserts += 1
            else:
                column.nucl = trg_nuc
                column.coverage += 1

                if qry_nuc == "-":
                    column.num_deletions += 1
                elif trg_nuc != qry_nuc:
                    column.num_missmatch += 1

            pos += 1

    return profile, aln_errors
コード例 #6
0
def _compute_profile(alignment, ref_sequence):
    """
    Builds per-position alignment statistics for the target.

    The target length is taken from ``trg_len`` of the first alignment
    and every position's ``nucl`` is initialised from ``ref_sequence``.
    Alignments whose ``qry_seq`` is shorter than the
    ``min_polish_aln_len`` setting are skipped.

    Returns a ``(profile, aln_errors)`` pair: ``profile`` is a list of
    ProfileInfo objects and ``aln_errors`` holds ``err_rate`` of every
    alignment that was used. Raises Exception when ``alignment`` is
    empty.
    """
    if not len(alignment):
        raise Exception("No alignmemnts!")
    genome_len = alignment[0].trg_len

    min_aln_len = cfg.vals["min_polish_aln_len"]
    aln_errors = []

    profile = [ProfileInfo() for _ in range(genome_len)]
    for pos, column in enumerate(profile):
        column.nucl = ref_sequence[pos]

    for aln in alignment:
        if len(aln.qry_seq) < min_aln_len:
            continue

        aln_errors.append(aln.err_rate)

        # the query is shifted first, then the target is shifted
        # against the already shifted query
        shifted_qry = shift_gaps(aln.trg_seq, aln.qry_seq)
        shifted_trg = shift_gaps(shifted_qry, aln.trg_seq)

        pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(shifted_trg, shifted_qry):
            is_insertion = trg_nuc == "-"
            if is_insertion:
                # a target gap is recorded at the preceding position
                pos -= 1

            column = profile[pos]
            if is_insertion:
                # inserted bases are accumulated per read id
                column.insertions[aln.qry_id] += qry_nuc
            else:
                column.coverage += 1

                if qry_nuc == "-":
                    column.num_deletions += 1
                elif trg_nuc != qry_nuc:
                    column.num_missmatch += 1

            pos += 1

    # every insertion is also counted on its neighbours, up to the
    # insertion length away on each side (clipped to the target bounds)
    for center in range(genome_len):
        for ins_str in profile[center].insertions.values():
            span = len(ins_str)
            window_begin = max(0, center - span)
            window_end = min(center + span + 1, genome_len)
            for pos in range(window_begin, window_end):
                profile[pos].propagated_ins += 1

    return profile, aln_errors
コード例 #7
0
def get_simple_repeats(repeat_graph, alignments_file, edge_seqs):
    """
    Collects repeats that are candidates for resolution.

    An unbranching path qualifies when its first edge is repetitive and
    not self-complement, none of the edges entering / leaving the path is
    repetitive, and there are exactly two distinct input and two distinct
    output edges. For each such path the reads aligned to it are gathered;
    the path is reported only if some read touches it and reads were seen
    entering from both inputs and leaving to both outputs.

    Returns a dict mapping path id to RepeatInfo.
    """
    MULT = 2

    def _oriented_seq(seq_name):
        # the first character of the name selects the orientation,
        # the remainder is the key in edge_seqs
        seq = edge_seqs[seq_name[1:]]
        if seq_name[0] == "-":
            seq = fp.reverse_complement(seq)
        return seq

    def _touches(read_aln, edge_ids):
        return any(edge_aln.edge_id in edge_ids for edge_aln in read_aln)

    # select the paths that look like simple repeats
    paths_to_resolve = []
    interesting_edges = set()
    for path in repeat_graph.get_unbranching_paths():
        head = path[0]
        if not head.repetitive or head.self_complement:
            continue

        in_edges = list(head.node_left.in_edges)
        out_edges = list(path[-1].node_right.out_edges)
        inputs = {edge.edge_id for edge in in_edges}
        outputs = {edge.edge_id for edge in out_edges}

        repetitive_flank = any(edge.repetitive
                               for edge in chain(in_edges, out_edges))
        if repetitive_flank or len(inputs) != MULT or len(outputs) != MULT:
            continue

        paths_to_resolve.append((path, inputs, outputs))
        interesting_edges.update(edge.edge_id for edge in path)

    # keep only the reads that touch at least one selected path
    interesting_alignments = [
        read_aln for read_aln in iter_alignments(alignments_file)
        if _touches(read_aln, interesting_edges)
    ]

    next_path_id = 1
    path_ids = {}
    repeats_dict = {}
    for path, inputs, outputs in paths_to_resolve:
        first_id = path[0].edge_id
        last_id = path[-1].edge_id

        if first_id not in path_ids:
            path_ids[first_id] = next_path_id
            # the negated id of the last edge gets the negated path id
            path_ids[-last_id] = -next_path_id
            next_path_id += 1
        path_id = path_ids[first_id]

        repeat_edge_ids = {edge.edge_id for edge in path}
        inner_reads = []
        input_reads = defaultdict(list)
        output_reads = defaultdict(list)
        for read_aln in interesting_alignments:
            if not _touches(read_aln, repeat_edge_ids):
                continue

            inner_reads.append(read_aln[0].overlap.cur_id)
            for prev_edge, next_edge in zip(read_aln, read_aln[1:]):
                enters = (prev_edge.edge_id in inputs
                          and next_edge.edge_id == first_id)
                if enters:
                    input_reads[prev_edge.edge_id].append(
                        prev_edge.overlap.cur_id)

                leaves = (prev_edge.edge_id == last_id
                          and next_edge.edge_id in outputs)
                if leaves:
                    output_reads[next_edge.edge_id].append(
                        next_edge.overlap.cur_id)

        if not inner_reads:
            continue
        if len(input_reads) != MULT or len(output_reads) != MULT:
            continue

        # sequences of the flanking edges, keyed by edge id
        sequences = {}
        for edge_id in chain(input_reads, output_reads):
            graph_edge = repeat_graph.edges[edge_id]
            sequences[edge_id] = _oriented_seq(
                graph_edge.edge_sequences[0].edge_seq_name)

        # the template is the concatenation of the path edges' sequences
        sequences["template"] = "".join(
            _oriented_seq(edge.edge_sequences[0].edge_seq_name)
            for edge in path)

        repeats_dict[path_id] = RepeatInfo(path_id,
                                           [edge.edge_id for edge in path],
                                           inner_reads, input_reads,
                                           output_reads, sequences, MULT)

    return repeats_dict