def realign_filter(rec, inslib):
    ''' Screen a record by realigning its consensus sequences to its library sequence.

    The library entry is looked up in inslib under the key
    "<Superfamily>:<Subfamily>" built from rec. Each of the four consensus
    fields of rec is then locally aligned against that entry.

    rec    -- mapping with 'Superfamily', 'Subfamily' and the four
              '*_Consensus_*' string fields listed below
    inslib -- mapping from "Superfamily:Subfamily" to a sequence string

    Returns False when the key is absent from inslib, or as soon as one
    consensus produces an alignment whose score/length ratio exceeds 0.9 over
    more than 25 alignment columns. Returns True otherwise.
    '''
    seqn = rec['Superfamily'] + ':' + rec['Subfamily']

    # Checked first so an unknown family costs neither the matrix build nor
    # any alignment work.
    if seqn not in inslib:
        return False

    # +1 on the diagonal (identical byte values), -1 everywhere else.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    seq_headers = [
        'Genomic_Consensus_5p',
        'Genomic_Consensus_3p',
        'Insert_Consensus_5p',
        'Insert_Consensus_3p',
    ]

    # The library sequence is the same for every consensus: encode it once.
    ref = align.string_to_alignment(inslib[seqn])

    for seqtype in seq_headers:
        query = align.string_to_alignment(rec[seqtype])

        # Only the score and the first aligned string are needed here.
        (s, a1, _) = align.align(query, ref, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)

        # Alignment score per aligned column; 0.0 for an empty alignment.
        score = 0.0
        if len(a1) > 0:
            score = float(s) / float(len(a1))

        if score > 0.9 and len(a1) > 25:
            return False

    return True
def realign_filter(rec, inslib):
    ''' Screen a record by realigning its consensus sequences to its library sequence.

    The library entry is looked up in inslib under the key
    "<Superfamily>:<Subfamily>" built from rec. Each of the four consensus
    fields of rec is then locally aligned against that entry.

    rec    -- mapping with 'Superfamily', 'Subfamily' and the four
              '*_Consensus_*' string fields listed below
    inslib -- mapping from "Superfamily:Subfamily" to a sequence string

    Returns False when the key is absent from inslib, or as soon as one
    consensus produces an alignment whose score/length ratio exceeds 0.9 over
    more than 25 alignment columns. Returns True otherwise.
    '''
    seqn = rec['Superfamily'] + ':' + rec['Subfamily']

    # Checked first so an unknown family costs neither the matrix build nor
    # any alignment work.
    if seqn not in inslib:
        return False

    # +1 on the diagonal (identical byte values), -1 everywhere else.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    seq_headers = [
        'Genomic_Consensus_5p',
        'Genomic_Consensus_3p',
        'Insert_Consensus_5p',
        'Insert_Consensus_3p',
    ]

    # The library sequence is the same for every consensus: encode it once.
    ref = align.string_to_alignment(inslib[seqn])

    for seqtype in seq_headers:
        query = align.string_to_alignment(rec[seqtype])

        # Only the score and the first aligned string are needed here.
        (s, a1, _) = align.align(query, ref, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)

        # Alignment score per aligned column; 0.0 for an empty alignment.
        score = 0.0
        if len(a1) > 0:
            score = float(s) / float(len(a1))

        if score > 0.9 and len(a1) > 25:
            return False

    return True
def consensus(self, minscore=0.9):
    ''' build consensus from sorted aligned reads iteratively

    The reads in self.reads are wrapped in SortableRead, sorted, trimmed with
    qualtrim (using the first read's minqual) and kept only when longer than
    20 characters. Runs of identical neighbouring sequences are collapsed.
    The first remaining sequence seeds the consensus. Each later sequence is
    locally aligned to the current consensus, and when its score/length ratio
    is at least minscore and the end position reported by locate_subseq lies
    within the last 5 characters of the consensus, the part of the sequence
    after its aligned block is appended.

    Returns a (consensus, score) tuple:
      ('', 0.0)  -- no reads, or none survive trimming
      (seq, 1.0) -- only one distinct sequence remains
      otherwise  -- the consensus and the mean score of the sequences that
                    extended it (0.0 when none did)
    '''
    # Without this guard self.reads[0] below raises IndexError; an empty
    # cluster is reported the same way as "no usable sequence".
    if len(self.reads) == 0:
        return '', 0.0

    minqual = self.reads[0].minqual

    sortable_reads = [SortableRead(sr.read) for sr in self.reads]
    seqs = [qualtrim(sorted_read.read, minqual=minqual)
            for sorted_read in sorted(sortable_reads)]
    seqs = [s for s in seqs if len(s) > 20]

    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1:  # no consensus necessary
        return seqs[0], 1.0

    # Drop a sequence when it equals its immediate predecessor.
    uniq_seqs = [seqs[0]]
    for prev, seq in zip(seqs, seqs[1:]):
        if seq != prev:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1:  # all seqs were the same!
        return uniq_seqs[0], 1.0

    # +1 on the diagonal (identical byte values), -1 everywhere else. Built
    # only once alignment is certain to be needed.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    cons = uniq_seqs[0]
    scores = []

    if len(uniq_seqs) > 1000:
        # Cap the work at 1000 sequences. Sampling without replacement keeps
        # the list free of the duplicates removed above; sorting the indices
        # keeps the surviving sequences in their original order.
        keep = sorted(np.random.choice(len(uniq_seqs), size=1000, replace=False))
        uniq_seqs = [uniq_seqs[u] for u in keep]

    for seq in uniq_seqs[1:]:
        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)

        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Gap characters are removed so the aligned part can be located in seq.
        a2 = align.alignment_to_string(a2).replace('-', '')

        # Alignment score per aligned column; 0.0 for an empty alignment.
        score = 0.0
        if len(a1) > 0:
            score = float(s) / float(len(a1))

        if re.search(a1, cons):
            _, cons_end = locate_subseq(cons, a1)

            if score >= minscore and cons_end > len(cons) - 5:
                scores.append(score)
                align_end = locate_subseq(seq, a2)[1]
                # Append whatever follows the aligned block in seq.
                cons += seq[align_end:]

    if scores:
        return cons, np.mean(scores)
    return cons, 0.0
def align_shortlist(seq, shortlist):
    """Align seq against every entry of shortlist.

    Returns a list with one (aligned_seq, aligned_entry) string pair per
    entry, in shortlist order. The alignment score is not returned.

    Uses the module-level substitution matrix S and -1 for both gap
    arguments of align.align.
    NOTE(review): the two trailing True flags are passed positionally; the
    first presumably selects local alignment -- confirm against the align
    package.
    """
    # The query is encoded once and reused for every candidate.
    query = list(align.string_to_alignment(seq))

    pairs = []
    for candidate in shortlist:
        target = list(align.string_to_alignment(candidate))
        _, query_aln, target_aln = align.align(query, target, -1, -1, S, True, True)
        pairs.append((
            align.alignment_to_string(query_aln),
            align.alignment_to_string(target_aln),
        ))
    return pairs
def local_check(*zipped):
    """Check that every (x, y) pair realigns with at most one mismatch.

    The positional arguments are forwarded to list(), so the function is
    meant to be called with a single iterable of (x, y) string pairs (or with
    nothing at all, which is treated as an empty iterable).

    For each pair, x with its '-' characters removed is aligned against y,
    and the aligned form of x is compared to y with hamming_dist.

    Returns False at the first pair whose distance exceeds 1, True otherwise
    (including when there are no pairs).
    """
    for x, y in list(*zipped):
        nx = list(align.string_to_alignment(x.replace("-", "")))
        ny = list(align.string_to_alignment(y))

        # Only the aligned form of x is needed; the score and the aligned
        # form of y are not used by the check.
        _, xr, _ = align.align(nx, ny, -1, -1, S, True, True)

        if hamming_dist(y, align.alignment_to_string(xr)) > 1:
            return False
    return True
def align_test(s, m):
    """Align s against m and report where the aligned part of s starts.

    Returns (score, index): score is the value returned by align.align, and
    index is the position in s of the aligned portion of s with gap
    characters removed (-1 if str.find does not locate it).

    NOTE(review): the two trailing True flags are passed positionally; the
    first presumably selects local alignment -- confirm against the align
    package.
    """
    # 256x256 int16 matrix: +1 on the diagonal, -1 elsewhere. Built in one
    # vectorized step instead of a 65,536-element Python comprehension.
    S = 2 * np.identity(256, dtype=np.short) - 1

    query = list(align.string_to_alignment(s))
    target = list(align.string_to_alignment(m))
    score, p, _ = align.align(query, target, -1, -1, S, True, True)

    # Strip gaps so the aligned segment can be searched for in the raw input.
    p = align.alignment_to_string(p).replace('-', '')
    return score, s.find(p)
def align(self, seq1, seq2, local=False):
    """Align seq1 with seq2 and return (aligned1, aligned2, score).

    The alignment uses self.gap_open, self.gap_extend and self.subs; local is
    forwarded to align.align.

    When local is true, both aligned strings are padded with '-' so that they
    cover the unaligned flanks: the left pad is the larger of the two offsets
    at which the gap-free aligned segments first occur in seq1 and seq2, and
    the right pad is the larger of the two remaining tail lengths. Both
    strings receive the same padding.

    Raises ValueError (from str.index) if a gap-free aligned segment cannot
    be found in its source sequence.

    NOTE(review): inside the body `align` must resolve to the alignment
    module rather than to this function -- true if this is a method defined
    in a class body; confirm.
    """
    encoded1 = align.string_to_alignment(seq1)
    encoded2 = align.string_to_alignment(seq2)
    score, aln1, aln2 = align.align(encoded1, encoded2, self.gap_open,
                                    self.gap_extend, self.subs, local)
    res1 = align.alignment_to_string(aln1)
    res2 = align.alignment_to_string(aln2)

    if not local:
        return res1, res2, score

    # Gap-free aligned segments, used to find where the alignment sits.
    core1 = res1.replace("-", "")
    core2 = res2.replace("-", "")
    offset1 = seq1.index(core1)
    offset2 = seq2.index(core2)

    tail1 = len(seq1) - len(core1) - offset1
    tail2 = len(seq2) - len(core2) - offset2

    left_pad = "-" * max(offset1, offset2)
    right_pad = "-" * max(tail1, tail2)
    return left_pad + res1 + right_pad, left_pad + res2 + right_pad, score
def align(self, seq1, seq2, local=False):
    """Align seq1 with seq2 and return (aligned1, aligned2, score).

    The alignment uses self.gap_open, self.gap_extend and self.subs; local is
    forwarded to align.align.

    When local is true, both aligned strings are padded with '-' on each
    side. The left pad length is the larger of the positions at which the
    gap-free aligned segments first occur in seq1 and seq2; the right pad
    length is the larger of what remains after those segments. The same
    padding is applied to both strings.

    Raises ValueError (from str.index) if a gap-free aligned segment cannot
    be found in its source sequence.

    NOTE(review): inside the body `align` must resolve to the alignment
    module rather than to this function -- true if this is a method defined
    in a class body; confirm.
    """
    score, raw1, raw2 = align.align(
        align.string_to_alignment(seq1),
        align.string_to_alignment(seq2),
        self.gap_open,
        self.gap_extend,
        self.subs,
        local,
    )
    res1 = align.alignment_to_string(raw1)
    res2 = align.alignment_to_string(raw2)

    if local:
        # Locate each gap-free aligned segment in its source sequence.
        ungapped1 = res1.replace("-", "")
        ungapped2 = res2.replace("-", "")
        begin1 = seq1.index(ungapped1)
        begin2 = seq2.index(ungapped2)

        lead = max(begin1, begin2)
        trail = max(len(seq1) - len(ungapped1) - begin1,
                    len(seq2) - len(ungapped2) - begin2)

        res1 = "".join(["-" * lead, res1, "-" * trail])
        res2 = "".join(["-" * lead, res2, "-" * trail])

    return res1, res2, score
def consensus(seqs, minscore=0.95):
    ''' build consensus from sorted aligned reads iteratively, expects seqs to be sorted in ref genome order

    Runs of identical neighbouring sequences are collapsed and the first
    remaining sequence seeds the consensus. Each later sequence is locally
    aligned to the current consensus, and when its score/length ratio is at
    least minscore and the end position reported by locate_subseq lies within
    the last 5 characters of the consensus, the part of the sequence after
    its aligned block is appended.

    Returns a (consensus, score) tuple:
      ('', 0.0)  -- seqs is empty
      (seq, 1.0) -- only one distinct sequence
      otherwise  -- the consensus and the mean score/length ratio over every
                    sequence that was aligned
    '''
    # Empty input used to fail with IndexError on seqs[0].
    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1:  # no consensus necessary
        return seqs[0], 1.0

    # Drop a sequence when it equals its immediate predecessor.
    uniq_seqs = [seqs[0]]
    for prev, seq in zip(seqs, seqs[1:]):
        if seq != prev:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1:  # all seqs were the same!
        return uniq_seqs[0], 1.0

    # +1 on the diagonal (identical byte values), -1 everywhere else. Built
    # only once alignment is certain to be needed.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    cons = uniq_seqs[0]
    scores = []

    if len(uniq_seqs) > 1000:
        # Cap the work at 1000 sequences. Indices are sampled without
        # replacement and sorted so the survivors stay distinct, stay plain
        # strings, and keep the input order this function relies on.
        keep = sorted(np.random.choice(len(uniq_seqs), size=1000, replace=False))
        uniq_seqs = [uniq_seqs[u] for u in keep]

    for seq in uniq_seqs[1:]:
        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)

        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Gap characters are removed so the aligned part can be located in seq.
        a2 = align.alignment_to_string(a2).replace('-', '')

        # Alignment score per aligned column. An empty alignment scores 0.0
        # instead of dividing by zero.
        score = 0.0
        if len(a1) > 0:
            score = float(s) / float(len(a1))

        scores.append(score)

        if re.search(a1, cons):
            _, cons_end = locate_subseq(cons, a1)

            if score >= minscore and cons_end > len(cons) - 5:
                align_end = locate_subseq(seq, a2)[1]
                # Append whatever follows the aligned block in seq.
                cons += seq[align_end:]

    return cons, np.mean(scores)
def consensus(seqs, minscore=0.95):
    ''' Merge reads into a consensus one at a time; seqs is expected to be sorted in ref genome order.

    Runs of identical neighbouring sequences are collapsed and the first
    remaining sequence seeds the consensus. Each later sequence is locally
    aligned to the current consensus. If the aligned consensus string is
    found in the consensus, then either:
      * its score/length ratio is at least minscore and the end position
        reported by locate_subseq lies within the last 5 characters of the
        consensus: the part of the sequence after its aligned block is
        appended; or
      * nothing has been appended so far: the seed is replaced by the next
        entry of the de-duplicated list.

    Returns a (consensus, score) tuple:
      ('', 0.0)  -- seqs is empty
      (seq, 1.0) -- only one distinct sequence
      otherwise  -- the consensus and the mean score/length ratio over every
                    sequence that was aligned
    '''
    # +1 on the diagonal (identical byte values), -1 everywhere else.
    S = (-np.ones((256, 256)) + 2 * np.identity(256)).astype(np.int16)

    if len(seqs) == 0:
        return '', 0.0
    if len(seqs) == 1:
        # a single read is its own consensus
        return seqs[0], 1.0

    # Keep a sequence only when it differs from its immediate predecessor.
    uniq_seqs = [seqs[0]] + [cur for prev, cur in zip(seqs, seqs[1:]) if cur != prev]

    if len(uniq_seqs) == 1:
        # every read was identical
        return uniq_seqs[0], 1.0

    seed_idx = 0
    cons = uniq_seqs[seed_idx]
    scores = []
    extended = False

    for read in uniq_seqs[1:]:
        s, cons_aln, read_aln = align.align(
            align.string_to_alignment(cons),
            align.string_to_alignment(read),
            -2, -2, S, local=True)
        a1 = align.alignment_to_string(cons_aln)
        # Gap characters are removed so the aligned part can be located in read.
        a2 = align.alignment_to_string(read_aln).replace('-', '')

        # Score per aligned column; 0.0 for an empty alignment.
        if len(a1) > 0:
            score = float(len(a1) - (len(a1) - s)) / float(len(a1))
        else:
            score = 0.0
        scores.append(score)

        if not re.search(a1, cons):
            continue

        _, cons_end = locate_subseq(cons, a1)

        if score >= minscore and cons_end > len(cons) - 5:
            # Append whatever follows the aligned block in read.
            cons += read[locate_subseq(read, a2)[1]:]
            extended = True
        elif not extended:
            # No scaffold yet: restart from the next distinct sequence.
            seed_idx += 1
            cons = uniq_seqs[seed_idx]

    return cons, np.mean(scores)
def consensus(self, minscore=0.9):
    ''' build consensus from sorted aligned reads iteratively

    The reads in self.reads are wrapped in SortableRead, sorted, trimmed with
    qualtrim (using the first read's minqual) and kept only when longer than
    20 characters. Runs of identical neighbouring sequences are collapsed.
    The first remaining sequence seeds the consensus. Each later sequence is
    locally aligned to the current consensus, and when its score/length ratio
    is at least minscore and the end position reported by locate_subseq lies
    within the last 5 characters of the consensus, the part of the sequence
    after its aligned block is appended.

    Returns a (consensus, score) tuple:
      ('', 0.0)  -- no reads, or none survive trimming
      (seq, 1.0) -- only one distinct sequence remains
      otherwise  -- the consensus and the mean score of the sequences that
                    extended it (0.0 when none did)
    '''
    # Without this guard self.reads[0] below raises IndexError; an empty
    # cluster is reported the same way as "no usable sequence".
    if len(self.reads) == 0:
        return '', 0.0

    minqual = self.reads[0].minqual

    sortable_reads = [SortableRead(sr.read) for sr in self.reads]
    seqs = [qualtrim(sorted_read.read, minqual=minqual)
            for sorted_read in sorted(sortable_reads)]
    seqs = [s for s in seqs if len(s) > 20]

    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1:  # no consensus necessary
        return seqs[0], 1.0

    # Drop a sequence when it equals its immediate predecessor.
    uniq_seqs = [seqs[0]]
    for prev, seq in zip(seqs, seqs[1:]):
        if seq != prev:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1:  # all seqs were the same!
        return uniq_seqs[0], 1.0

    # +1 on the diagonal (identical byte values), -1 everywhere else. Built
    # only once alignment is certain to be needed.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    cons = uniq_seqs[0]
    scores = []

    if len(uniq_seqs) > 1000:
        # Cap the work at 1000 sequences. Sampling without replacement keeps
        # the list free of the duplicates removed above; sorting the indices
        # keeps the surviving sequences in their original order.
        keep = sorted(np.random.choice(len(uniq_seqs), size=1000, replace=False))
        uniq_seqs = [uniq_seqs[u] for u in keep]

    for seq in uniq_seqs[1:]:
        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)

        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Gap characters are removed so the aligned part can be located in seq.
        a2 = align.alignment_to_string(a2).replace('-', '')

        # Alignment score per aligned column; 0.0 for an empty alignment.
        score = 0.0
        if len(a1) > 0:
            score = float(s) / float(len(a1))

        if re.search(a1, cons):
            _, cons_end = locate_subseq(cons, a1)

            if score >= minscore and cons_end > len(cons) - 5:
                scores.append(score)
                align_end = locate_subseq(seq, a2)[1]
                # Append whatever follows the aligned block in seq.
                cons += seq[align_end:]

    if scores:
        return cons, np.mean(scores)
    return cons, 0.0
def consensus(seqs, minscore=0.95):
    ''' Merge reads into a consensus one at a time; seqs is expected to be sorted in ref genome order.

    Runs of identical neighbouring sequences are collapsed and the first
    remaining sequence seeds the consensus. Each later sequence is locally
    aligned to the current consensus. If the aligned consensus string is
    found in the consensus, then either:
      * its score/length ratio is at least minscore and the end position
        reported by locate_subseq lies within the last 5 characters of the
        consensus: the part of the sequence after its aligned block is
        appended; or
      * nothing has been appended so far: the seed is replaced by the next
        entry of the de-duplicated list.

    Returns a (consensus, score) tuple:
      ('', 0.0)  -- seqs is empty
      (seq, 1.0) -- only one distinct sequence
      otherwise  -- the consensus and the mean score/length ratio over every
                    sequence that was aligned
    '''
    # +1 on the diagonal (identical byte values), -1 everywhere else.
    S = (-np.ones((256, 256)) + 2 * np.identity(256)).astype(np.int16)

    if len(seqs) == 0:
        return '', 0.0
    if len(seqs) == 1:
        # a single read is its own consensus
        return seqs[0], 1.0

    # Keep a sequence only when it differs from its immediate predecessor.
    uniq_seqs = [seqs[0]] + [cur for prev, cur in zip(seqs, seqs[1:]) if cur != prev]

    if len(uniq_seqs) == 1:
        # every read was identical
        return uniq_seqs[0], 1.0

    seed_idx = 0
    cons = uniq_seqs[seed_idx]
    scores = []
    extended = False

    for read in uniq_seqs[1:]:
        s, cons_aln, read_aln = align.align(
            align.string_to_alignment(cons),
            align.string_to_alignment(read),
            -2, -2, S, local=True)
        a1 = align.alignment_to_string(cons_aln)
        # Gap characters are removed so the aligned part can be located in read.
        a2 = align.alignment_to_string(read_aln).replace('-', '')

        # Score per aligned column; 0.0 for an empty alignment.
        if len(a1) > 0:
            score = float(len(a1) - (len(a1) - s)) / float(len(a1))
        else:
            score = 0.0
        scores.append(score)

        if not re.search(a1, cons):
            continue

        _, cons_end = locate_subseq(cons, a1)

        if score >= minscore and cons_end > len(cons) - 5:
            # Append whatever follows the aligned block in read.
            cons += read[locate_subseq(read, a2)[1]:]
            extended = True
        elif not extended:
            # No scaffold yet: restart from the next distinct sequence.
            seed_idx += 1
            cons = uniq_seqs[seed_idx]

    return cons, np.mean(scores)