def get_alignment_parasail(reference_genome, input_genome):
    """Globally align the input genome against the reference genome.

    Both arguments only need a ``sequence`` attribute holding the
    nucleotide string. The input sequence is passed to parasail as the
    first (query) sequence and the reference as the second one.

    Returns:
        A ``(ref, comp, query)`` tuple taken from the parasail traceback:
        the gapped reference row, the comparison row and the gapped
        input row.
    """
    # NOTE: dnafull understands ambiguity codes, but an "N" still does not
    # get a free mismatch, which would have been preferable. The resulting
    # alignments are good enough for our purpose regardless.
    gap_open, gap_extend = 10, 1
    aligned = parasail.nw_trace_striped_32(
        input_genome.sequence,
        reference_genome.sequence,
        gap_open,
        gap_extend,
        parasail.dnafull,
    ).traceback

    return aligned.ref, aligned.comp, aligned.query
Beispiel #2
0
def build_envelope(len1, seq1, path1, len2, seq2, path2, padding=15):
    """Build a padded window into the second path for every row of the first.

    ``seq1`` and ``seq2`` are globally aligned, and each alignment column
    pairs an index into ``seq1`` with an index into ``seq2``. ``path1[k]``
    is used as the first envelope row belonging to ``seq1[k]``; the next
    entry (or ``len1`` for the last one) is the exclusive end of that run
    of rows. ``path2``/``len2`` are read the same way, and their values
    become the window bounds stored in the envelope.

    Args:
        len1: Number of rows of the returned envelope.
        seq1: First sequence (aligned as the parasail reference).
        path1: Start offsets for each position of ``seq1``. Must be a
            ``list``: it is extended with ``+ [len1]``.
        len2: Upper clip bound for the window values.
        seq2: Second sequence (aligned as the parasail query).
        path2: Start offsets for each position of ``seq2`` (a ``list``).
        padding: Amount subtracted from every window start and added to
            every window end before clipping.

    Returns:
        ``numpy.ndarray`` of shape ``(len1, 2)`` and dtype ``uint64``
        holding ``[start, end]`` for every row.
    """
    # Needleman-Wunsch alignment; gap open and gap extend are both 2.
    trace = parasail.nw_trace_striped_32(
        seq2, seq1, 2, 2, parasail.dnafull).traceback

    # Running count of non-gap characters, minus one, gives the index of
    # the most recent base consumed in each sequence at every column.
    ref_pos = np.cumsum([base != '-' for base in trace.ref]) - 1
    query_pos = np.cumsum([base != '-' for base in trace.query]) - 1
    # Columns before the first base of a sequence yield -1; clip them so
    # they pair with index 0 instead.
    pairs = np.column_stack([ref_pos, query_pos]).clip(0)

    # One (start, exclusive end) row per sequence position.
    spans1 = np.column_stack([path1, path1[1:] + [len1]])
    spans2 = np.column_stack([path2, path2[1:] + [len2]])

    # -1 marks a bound that has not been set yet.
    envelope = np.full((len1, 2), -1, dtype=int)

    for pos1, pos2 in pairs:
        row_begin, row_end = spans1[pos1]
        win_start, win_end = spans2[pos2]

        for row in range(row_begin, row_end):
            cur_start, cur_end = envelope[row]
            # Widen the window: keep the smallest start and largest end.
            if cur_start < 0 or win_start < cur_start:
                envelope[row, 0] = win_start
            if cur_end < 0 or win_end > cur_end:
                envelope[row, 1] = win_end

    # Add a little padding to ensure some overlap, then keep every bound
    # inside [0, len2]. Rows that were never set still hold -1 here and
    # are shifted like any other row.
    padded_start = envelope[:, 0] - padding
    padded_end = envelope[:, 1] + padding
    envelope[:, 0] = padded_start
    envelope[:, 1] = padded_end
    envelope = envelope.clip(0, len2)

    # Make every window start no later than the previous window's end
    # (the first one is forced to start at 0).
    last_end = 0
    for window in envelope:
        # ``window`` is a view, so writes go straight into ``envelope``.
        if window[0] > window[1]:
            window[0] = 0

        if last_end < window[0]:
            window[0] = last_end

        last_end = window[1]

    return envelope.astype(np.uint64)
Beispiel #3
0
def get_alignment(reference_genome, input_genome):
    """Map reference positions to input positions via a global alignment.

    Both arguments only need a ``sequence`` attribute holding the
    nucleotide string. The input sequence is passed to parasail as the
    first (query) sequence and the reference as the second one.

    Returns:
        A list of ``(reference_index, input_index)`` tuples of 0-based
        positions, one for every alignment column in which neither
        sequence has a gap. Mismatching columns are included; columns
        with a gap on either side are skipped.
    """
    # the dna full matrix supports ambiguity codes, although "N"s are not
    # given free mismatches as we might like
    # the alignments appear good enough for our purpose however
    result = parasail.nw_trace_striped_32(input_genome.sequence,
                                          reference_genome.sequence, 10, 1,
                                          parasail.dnafull)
    traceback = result.traceback

    position_map = []

    # Next unconsumed position in each ungapped sequence.
    reference_index = 0
    input_index = 0

    for ref, query in zip(traceback.ref, traceback.query):
        ref_has_base = ref != '-'
        query_has_base = query != '-'

        if ref_has_base and query_has_base:
            position_map.append((reference_index, input_index))

        # A gap character consumes no position in that sequence.
        if ref_has_base:
            reference_index += 1
        if query_has_base:
            input_index += 1

    return position_map