Exemple #1
0
def realign_filter(rec, inslib):
    ''' check the consensus sequences of a record against its insertion library entry

    rec is indexed by 'Superfamily', 'Subfamily' and the four consensus
    headers listed below; inslib is looked up with the key
    'Superfamily:Subfamily'.

    Returns False when that key is missing from inslib, or when any of the
    four consensus sequences yields a local alignment to the library sequence
    longer than 25 characters with a per-character score above 0.9.
    Returns True only after all four consensus sequences have been checked.
    '''
    seqn = rec['Superfamily'] + ':' + rec['Subfamily']
    if seqn not in inslib:
        return False

    # substitution matrix: +1 on the diagonal (match), -1 everywhere else
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    seq_headers = [
        'Genomic_Consensus_5p', 'Genomic_Consensus_3p', 'Insert_Consensus_5p',
        'Insert_Consensus_3p'
    ]

    for seqtype in seq_headers:
        s1 = align.string_to_alignment(rec[seqtype])
        s2 = align.string_to_alignment(inslib[seqn])

        # only the alignment score and the aligned form of the record
        # sequence are needed here
        (s, a1, _) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)

        # alignment score per aligned character; stays 0.0 for an empty alignment
        score = 0.0
        if len(a1) > 0:
            score = float(s) / float(len(a1))

        if score > 0.9 and len(a1) > 25:
            return False

    # The success return used to sit inside the loop body, which ended the
    # function after the first header; it belongs after the loop so every
    # header is examined.
    return True
Exemple #2
0
def realign_filter(rec, inslib):
    ''' test every consensus sequence of rec against the matching inslib sequence

    The library key is rec['Superfamily'] + ':' + rec['Subfamily'].

    Returns:
        False if the key is not in inslib.
        False if any of the four consensus fields aligns locally to the
        library sequence with more than 25 aligned characters and a
        score-per-character above 0.9.
        True otherwise, once all four fields have been examined.
    '''
    seqn = rec['Superfamily'] + ':' + rec['Subfamily']
    if seqn not in inslib:
        return False

    # +1 for identical byte values, -1 for any mismatch
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    seq_headers = ['Genomic_Consensus_5p', 'Genomic_Consensus_3p', 'Insert_Consensus_5p', 'Insert_Consensus_3p']

    for seqtype in seq_headers:
        s1 = align.string_to_alignment(rec[seqtype])
        s2 = align.string_to_alignment(inslib[seqn])

        # the aligned library sequence is not used by this filter
        (s, a1, _) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)

        # guard against dividing by zero when nothing aligned
        score = float(s) / float(len(a1)) if len(a1) > 0 else 0.0

        if score > 0.9 and len(a1) > 25:
            return False

    # Returning True from inside the loop skipped the last three headers;
    # succeed only once the whole list has been processed.
    return True
Exemple #3
0
    def consensus(self, minscore=0.9):
        ''' iteratively extend a consensus sequence across this object's sorted reads

        Every read is wrapped in SortableRead, sorted, trimmed with qualtrim
        (using the minqual of the first read) and kept only when the trimmed
        sequence is longer than 20 characters. Adjacent duplicates are
        collapsed, and at most 1000 sequences are aligned.

        Returns a (sequence, score) tuple:
          ('', 0.0)       when no trimmed sequence is kept
          (sequence, 1.0) when one sequence, or one distinct sequence, remains
          otherwise the extended consensus with the mean score of the
          alignments that extended it (0.0 if none did)
        '''
        trim_qual = self.reads[0].minqual

        seqs = []
        for wrapped in sorted([SortableRead(sr.read) for sr in self.reads]):
            trimmed = qualtrim(wrapped.read, minqual=trim_qual)
            if len(trimmed) > 20:
                seqs.append(trimmed)

        if not seqs:
            return '', 0.0

        if len(seqs) == 1:
            # a single sequence is its own consensus
            return seqs[0], 1.0

        # drop sequences equal to their immediate predecessor
        uniq_seqs = seqs[:1]
        for previous, current in zip(seqs, seqs[1:]):
            if current != previous:
                uniq_seqs.append(current)

        if len(uniq_seqs) == 1:
            # every sequence was identical
            return uniq_seqs[0], 1.0

        cons = uniq_seqs[0]

        if len(uniq_seqs) > 1000:
            # random subsample; sorting the drawn indices keeps list order
            picked = sorted(np.random.choice(range(len(uniq_seqs)), size=1000))
            uniq_seqs = [uniq_seqs[p] for p in picked]

        # +1 match / -1 mismatch substitution matrix
        S = (-np.ones((256, 256)) + 2 * np.identity(256)).astype(np.int16)

        accepted_scores = []

        for seq in uniq_seqs[1:]:
            s1 = align.string_to_alignment(cons)
            s2 = align.string_to_alignment(seq)

            (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
            a1 = align.alignment_to_string(a1)
            # gap characters removed so a2 can be located inside seq
            a2 = align.alignment_to_string(a2).replace('-', '')

            aligned_len = len(a1)
            if aligned_len > 0:
                score = float(aligned_len - (aligned_len - s)) / float(aligned_len)
            else:
                score = 0.0

            if not re.search(a1, cons):
                continue

            _, cons_end = locate_subseq(cons, a1)

            # extend only when the alignment scores well enough and its end
            # coordinate lies within the last 5 positions of the consensus
            if score >= minscore and cons_end > len(cons) - 5:
                accepted_scores.append(score)
                cons += seq[locate_subseq(seq, a2)[1]:]

        if accepted_scores:
            return cons, np.mean(accepted_scores)

        return cons, 0.0
def align_shortlist(seq, shortlist):
    ''' align seq against every entry of shortlist

    Uses gap penalties of -1/-1 and the module-level substitution matrix S.
    Returns a list with one (aligned seq, aligned entry) pair of strings per
    shortlist entry, in shortlist order. The alignment scores are discarded.
    '''
    # seq is converted once and reused for every shortlist entry
    query = list(align.string_to_alignment(seq))

    pairs = []
    for candidate in shortlist:
        target = list(align.string_to_alignment(candidate))
        # NOTE(review): the two trailing True flags are passed positionally;
        # the first lines up with `local` in other calls, the second is not
        # identifiable from this file - confirm against the align module
        _, query_aln, target_aln = align.align(query, target, -1, -1, S,
                                               True, True)
        pairs.append((align.alignment_to_string(query_aln),
                      align.alignment_to_string(target_aln)))
    return pairs
def local_check(*zipped):
    ''' check that each (x, y) pair realigns to within one mismatch

    The positional arguments are forwarded to list(), so in practice either
    nothing or a single iterable of (x, y) pairs may be passed. For every
    pair, x with its '-' characters removed is aligned against y (gap
    penalties -1/-1, module-level matrix S) and the aligned form of x is
    compared with y via hamming_dist.

    Returns False at the first pair whose distance exceeds 1, else True
    (including when there are no pairs).
    '''

    def _pair_ok(x, y):
        query = list(align.string_to_alignment(x.replace("-", "")))
        target = list(align.string_to_alignment(y))
        _, query_aln, target_aln = align.align(query, target, -1, -1, S,
                                               True, True)
        realigned = align.alignment_to_string(query_aln)
        # conversion of the second alignment kept as in the original flow;
        # its result is not used
        align.alignment_to_string(target_aln)
        return not hamming_dist(y, realigned) > 1

    # all() stops at the first failing pair, like an early return
    return all(_pair_ok(x, y) for x, y in list(*zipped))
def align_test(s, m):
    ''' align s against m and report the score and where the aligned part of s starts

    Builds a 256x256 substitution matrix of shorts (+1 on the diagonal, -1
    elsewhere) and aligns with gap penalties -1/-1.

    Returns (score, position), where position is the result of str.find for
    the gap-stripped aligned form of s inside s (-1 if it is not found).
    '''
    S = np.full((256, 256), -1, dtype=np.short)
    np.fill_diagonal(S, 1)

    query = list(align.string_to_alignment(s))
    target = list(align.string_to_alignment(m))
    score, query_aln, _ = align.align(query, target, -1, -1, S, True, True)

    matched = align.alignment_to_string(query_aln).replace('-', '')
    return score, s.find(matched)
    def align(self, seq1, seq2, local=False):
        ''' align seq1 with seq2 using this object's gap penalties and matrix

        Passes self.gap_open, self.gap_extend and self.subs to align.align.
        For a local alignment, both aligned strings are padded with '-' on
        each side by the larger of the two unaligned flank lengths.

        Returns (aligned seq1, aligned seq2, score).
        Raises ValueError (from str.index) if, for a local alignment, a
        gap-stripped aligned string is not a substring of its input.
        '''
        enc1 = align.string_to_alignment(seq1)
        enc2 = align.string_to_alignment(seq2)
        score, aln1, aln2 = align.align(enc1, enc2, self.gap_open,
                                        self.gap_extend, self.subs, local)
        res1 = align.alignment_to_string(aln1)
        res2 = align.alignment_to_string(aln2)

        if not local:
            return res1, res2, score

        # locate the aligned region (without gaps) inside each input
        core1 = res1.replace("-", "")
        core2 = res2.replace("-", "")
        start1 = seq1.index(core1)
        start2 = seq2.index(core2)

        # the longer unaligned flank on each side sets the padding width
        left_pad = "-" * max(start1, start2)
        right_pad = "-" * max(len(seq1) - len(core1) - start1,
                              len(seq2) - len(core2) - start2)

        return left_pad + res1 + right_pad, left_pad + res2 + right_pad, score
    def align(self, seq1, seq2, local=False):
        ''' run align.align on two strings and return both aligned strings plus the score

        Gap penalties and the substitution matrix come from self.gap_open,
        self.gap_extend and self.subs. When local is true, the two results
        are padded with '-' on the left and right so the unaligned flanks of
        the inputs are represented; the pad width on each side is the larger
        of the two flank lengths.

        Returns a (res1, res2, score) tuple. str.index raises ValueError if a
        gap-stripped local result cannot be found in its input sequence.
        '''
        first = align.string_to_alignment(seq1)
        second = align.string_to_alignment(seq2)
        score, a1, a2 = align.align(first, second, self.gap_open,
                                    self.gap_extend, self.subs, local)
        res1 = align.alignment_to_string(a1)
        res2 = align.alignment_to_string(a2)

        if local:
            bare1 = res1.replace("-", "")
            bare2 = res2.replace("-", "")
            offset1 = seq1.index(bare1)
            offset2 = seq2.index(bare2)

            # unaligned lengths after the aligned region of each input
            trailing1 = len(seq1) - len(bare1) - offset1
            trailing2 = len(seq2) - len(bare2) - offset2

            head = "-" * max(offset1, offset2)
            tail = "-" * max(trailing1, trailing2)
            res1 = head + res1 + tail
            res2 = head + res2 + tail

        return res1, res2, score
Exemple #9
0
def consensus(seqs, minscore=0.95):
    ''' build consensus from sorted aligned reads iteratively, expects seqs to be sorted in ref genome order

    Returns a (consensus, score) tuple:
      ('', 0.0)       for empty input
      (sequence, 1.0) when there is one sequence, or only one distinct
                      sequence after collapsing adjacent duplicates
      otherwise the extended consensus and the mean of all per-alignment
      scores (an empty alignment contributes 0.0)
    '''
    if len(seqs) == 0:
        # nothing to build from; previously this raised IndexError
        return '', 0.0

    if len(seqs) == 1: # no consensus necessary
        return seqs[0], 1.0

    # collapse runs of identical neighbouring sequences
    uniq_seqs = [seqs[0]]
    for prev, seq in zip(seqs, seqs[1:]):
        if seq != prev:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1: # all seqs were the same!
        return uniq_seqs[0], 1.0

    cons = uniq_seqs[0]
    scores = []

    if len(uniq_seqs) > 1000:
        # Subsample by index and sort the indices so the reference order the
        # extension step relies on is kept; sampling the list directly
        # shuffled it and could draw the same sequence more than once.
        keep = sorted(np.random.choice(len(uniq_seqs), size=1000, replace=False))
        uniq_seqs = [uniq_seqs[k] for k in keep]

    # substitution matrix: +1 for a match, -1 for a mismatch
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    for seq in uniq_seqs[1:]:

        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)

        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # gap characters removed so a2 can be located inside seq
        a2 = align.alignment_to_string(a2).replace('-', '')

        # score per aligned character; guard the empty alignment, which
        # previously raised ZeroDivisionError
        score = 0.0
        if len(a1) > 0:
            score = float(s) / float(len(a1))
        scores.append(score)

        if re.search(a1, cons):
            _, cons_end = locate_subseq(cons, a1)

            # extend only on a good alignment whose end coordinate lies
            # within the last 5 positions of the current consensus
            if score >= minscore and cons_end > len(cons)-5:
                align_end = locate_subseq(seq, a2)[1]
                cons += seq[align_end:]

    return cons, np.mean(scores)
Exemple #10
0
def consensus(seqs, minscore=0.95):
    ''' grow a consensus by aligning each successive sequence to it; seqs must already be in ref genome order

    Returns (consensus, score): ('', 0.0) for empty input, (sequence, 1.0)
    when only one distinct sequence is present, otherwise the consensus and
    the mean of every per-alignment score.

    Until the first successful extension, a failed extension attempt moves
    the starting sequence one position further along the list.
    '''
    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1:
        # a lone sequence is already the consensus
        return seqs[0], 1.0

    # keep a sequence only if it differs from the one right before it
    uniq_seqs = [seqs[0]]
    for earlier, later in zip(seqs, seqs[1:]):
        if later != earlier:
            uniq_seqs.append(later)

    if len(uniq_seqs) == 1:
        # all inputs were identical
        return uniq_seqs[0], 1.0

    # +1 on the diagonal (match), -1 elsewhere (mismatch)
    S = (-np.ones((256, 256)) + 2 * np.identity(256)).astype(np.int16)

    seed_idx = 0
    cons = uniq_seqs[seed_idx]
    scaffold_found = False
    all_scores = []

    for seq in uniq_seqs[1:]:
        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)

        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # strip gaps so the aligned part can be found in seq
        a2 = align.alignment_to_string(a2).replace('-', '')

        n_aligned = len(a1)
        score = float(n_aligned - (n_aligned - s)) / float(n_aligned) if n_aligned > 0 else 0.0
        all_scores.append(score)

        if not re.search(a1, cons):
            continue

        _, cons_end = locate_subseq(cons, a1)
        reaches_end = cons_end > len(cons) - 5

        if score >= minscore and reaches_end:
            # append whatever of seq lies past the aligned region
            cons += seq[locate_subseq(seq, a2)[1]:]
            scaffold_found = True
        elif not scaffold_found:
            # no scaffold yet: restart from the next sequence in the list
            seed_idx += 1
            cons = uniq_seqs[seed_idx]

    return cons, np.mean(all_scores)
Exemple #11
0
    def consensus(self, minscore=0.9):
        ''' assemble a consensus by walking the sorted reads and extending on good alignments

        Reads are sorted via SortableRead, quality-trimmed with qualtrim
        using the first read's minqual, and discarded unless the trimmed
        sequence has more than 20 characters. Consecutive duplicates are
        merged and no more than 1000 sequences are aligned.

        Returns (consensus, score). The score is 0.0 when nothing survives
        trimming or no alignment extended the consensus, 1.0 when a single
        distinct sequence remains, and otherwise the mean score of the
        alignments that extended the consensus.
        '''
        minqual = self.reads[0].minqual

        ordered = sorted([SortableRead(sr.read) for sr in self.reads])
        trimmed = [qualtrim(item.read, minqual=minqual) for item in ordered]
        seqs = [t for t in trimmed if len(t) > 20]

        if not seqs:
            return '', 0.0

        if len(seqs) == 1:
            # nothing to merge
            return seqs[0], 1.0

        # merge runs of equal neighbours into one entry
        uniq_seqs = [seqs[0]]
        for idx in range(1, len(seqs)):
            if seqs[idx] != seqs[idx - 1]:
                uniq_seqs.append(seqs[idx])

        if len(uniq_seqs) == 1:
            # all trimmed sequences were the same
            return uniq_seqs[0], 1.0

        cons = uniq_seqs[0]

        if len(uniq_seqs) > 1000:
            # random draw of 1000 indices, sorted to preserve list order
            chosen = sorted(
                np.random.choice(range(len(uniq_seqs)), size=1000))
            uniq_seqs = [uniq_seqs[c] for c in chosen]

        # match = +1, mismatch = -1
        S = -np.ones((256, 256)) + 2 * np.identity(256)
        S = S.astype(np.int16)

        good_scores = []

        for seq in uniq_seqs[1:]:
            (s, a1, a2) = align.align(align.string_to_alignment(cons),
                                      align.string_to_alignment(seq),
                                      -2, -2, S, local=True)
            a1 = align.alignment_to_string(a1)
            # gaps removed so the aligned part can be located in seq
            a2 = align.alignment_to_string(a2).replace('-', '')

            width = len(a1)
            score = 0.0
            if width > 0:
                score = float(width - (width - s)) / float(width)

            if re.search(a1, cons):
                _, cons_end = locate_subseq(cons, a1)

                # the alignment must score at least minscore and end within
                # the final 5 positions of the consensus
                if score >= minscore and cons_end > len(cons) - 5:
                    good_scores.append(score)
                    tail_start = locate_subseq(seq, a2)[1]
                    cons += seq[tail_start:]

        mean_score = np.mean(good_scores) if good_scores else 0.0
        return cons, mean_score
Exemple #12
0
def consensus(seqs, minscore=0.95):
    ''' iteratively merge reads (given in ref genome order) into one consensus sequence

    Returns a (consensus, score) tuple:
      ('', 0.0)       if seqs is empty
      (sequence, 1.0) if there is a single sequence or all are identical
      otherwise the consensus and the mean over all alignment scores

    While no extension has succeeded yet, each failed extension attempt
    re-seeds the consensus with the next sequence in the de-duplicated list.
    '''
    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1:
        # single input: return it unchanged
        return seqs[0], 1.0

    # remove entries that repeat their immediate predecessor
    uniq_seqs = [seqs[0]]
    uniq_seqs.extend(
        cur for prev, cur in zip(seqs, seqs[1:]) if cur != prev)

    if len(uniq_seqs) == 1:
        # only one distinct sequence
        return uniq_seqs[0], 1.0

    # substitution matrix scoring +1 for equal bytes and -1 otherwise
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    scores = []
    start_at = 0
    cons = uniq_seqs[start_at]
    extended_once = False

    for seq in uniq_seqs[1:]:
        cons_enc = align.string_to_alignment(cons)
        seq_enc = align.string_to_alignment(seq)

        (s, a1, a2) = align.align(cons_enc, seq_enc, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # drop gap characters from the aligned form of seq
        a2 = align.alignment_to_string(a2).replace('-', '')

        if len(a1) > 0:
            score = float(len(a1) - (len(a1) - s)) / float(len(a1))
        else:
            score = 0.0

        scores.append(score)

        if re.search(a1, cons):
            _, cons_end = locate_subseq(cons, a1)

            if score >= minscore and cons_end > len(cons) - 5:
                # good alignment ending near the tail of the consensus:
                # append the part of seq beyond the aligned region
                seq_end = locate_subseq(seq, a2)[1]
                cons += seq[seq_end:]
                extended_once = True

            elif not extended_once:
                # still searching for a scaffold; try the next start
                start_at += 1
                cons = uniq_seqs[start_at]

    return cons, np.mean(scores)