def realign_filter(rec, inslib):
    """Screen a candidate record against the insertion reference library.

    Returns False when the record's family key is missing from ``inslib`` or
    when any of its four consensus sequences locally aligns to the library
    sequence with >90% identity over more than 25 columns; True otherwise.
    """
    # Substitution matrix over all byte values: +1 on the diagonal, -1 elsewhere.
    subst = (-np.ones((256, 256)) + 2 * np.identity(256)).astype(np.int16)

    family_key = rec['Superfamily'] + ':' + rec['Subfamily']
    if family_key not in inslib:
        return False

    consensus_columns = (
        'Genomic_Consensus_5p',
        'Genomic_Consensus_3p',
        'Insert_Consensus_5p',
        'Insert_Consensus_3p',
    )

    for column in consensus_columns:
        query = align.string_to_alignment(rec[column])
        target = align.string_to_alignment(inslib[family_key])

        # Local alignment, gap open/extend penalty -2.
        raw_score, aln_query, _ = align.align(query, target, -2, -2, subst,
                                              local=True)
        aln_query = align.alignment_to_string(aln_query)

        identity = 0.0
        if aln_query:
            # Equivalent to raw_score / len(aln_query): per-column identity.
            identity = float(len(aln_query) - (len(aln_query) - raw_score)) \
                / float(len(aln_query))

        if identity > 0.9 and len(aln_query) > 25:
            return False

    return True
def align_test(s, m):
    """Locally align ``m`` against ``s``.

    Returns ``(score, offset)`` where ``offset`` is the start index in ``s``
    of the gap-stripped aligned region (``-1`` if not found in ``s``).
    """
    # +1 match / -1 mismatch substitution matrix over all byte values.
    subst = np.full((256, 256), -1, dtype=np.short)
    np.fill_diagonal(subst, 1)

    encoded_s = list(align.string_to_alignment(s))
    encoded_m = list(align.string_to_alignment(m))

    score, aligned_s, _ = align.align(encoded_s, encoded_m, -1, -1, subst,
                                      True, True)

    matched = align.alignment_to_string(aligned_s).replace('-', '')
    return score, s.find(matched)
def realign_filter(rec, inslib):
    """Return False when any of rec's consensus sequences aligns strongly
    (>90% identity over more than 25 aligned columns) to the reference
    sequence for rec's Superfamily:Subfamily in ``inslib``; True otherwise.
    Also returns False when the family key is absent from ``inslib``.
    """
    # Substitution matrix: +1 on the diagonal (match), -1 elsewhere (mismatch).
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    seqn = rec['Superfamily'] + ':' + rec['Subfamily']
    if seqn not in inslib:
        return False

    seq_headers = [
        'Genomic_Consensus_5p', 'Genomic_Consensus_3p', 'Insert_Consensus_5p',
        'Insert_Consensus_3p'
    ]

    for seqtype in seq_headers:
        s1 = align.string_to_alignment(rec[seqtype])
        s2 = align.string_to_alignment(inslib[seqn])

        # Local alignment with gap open/extend penalty -2.
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Gap-stripped second alignment string; computed but not used below.
        a2 = ''.join(
            [b for b in list(align.alignment_to_string(a2)) if b != '-'])

        score = 0.0
        if len(a1) > 0:
            # Simplifies to s / len(a1): fraction of columns contributing +1.
            score = float(len(a1) - (len(a1) - s)) / float(len(a1))
        #print seqtype, score, len(a1)

        if score > 0.9 and len(a1) > 25:
            return False

    return True
def consensus(self, minscore = 0.9):
    """Build a consensus sequence from this object's reads iteratively.

    Reads are quality-trimmed, sorted, deduplicated (adjacent duplicates
    only), then each subsequent sequence is locally aligned to the growing
    consensus and used to extend its 3' end when the alignment scores at
    least ``minscore`` and reaches near the consensus end.

    Returns (consensus string, mean score of accepted extensions);
    the score is 0.0 when no extension was accepted.
    """
    # Substitution matrix: +1 match / -1 mismatch over all byte values.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    minqual = self.reads[0].minqual

    sortable_reads = [SortableRead(sr.read) for sr in self.reads]
    seqs = [qualtrim(sorted_read.read, minqual=minqual) for sorted_read in sorted(sortable_reads)]
    # Discard very short trimmed reads.
    seqs = [s for s in seqs if len(s) > 20]

    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1: # no consensus necessary
        return seqs[0], 1.0

    # Collapse runs of identical adjacent sequences (input is sorted).
    uniq_seqs = [seqs[0]]
    for i, seq in enumerate(seqs[1:], start=1):
        if seq != seqs[i-1]:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1: # all seqs were the same!
        return uniq_seqs[0], 1.0

    cons = uniq_seqs[0]
    scores = []

    # Downsample huge clusters; sampling sorted *indices* preserves order.
    if len(uniq_seqs) > 1000:
        uniq_seqs = [uniq_seqs[u] for u in sorted(np.random.choice(range(len(uniq_seqs)), size=1000))]

    for seq in uniq_seqs[1:]:
        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Gap-stripped aligned portion of the incoming sequence.
        a2 = ''.join([b for b in list(align.alignment_to_string(a2)) if b != '-'])

        score = 0.0
        if len(a1) > 0:
            # Simplifies to s / len(a1): per-column identity of the alignment.
            score = float(len(a1) - (len(a1)-s)) / float(len(a1))

        # NOTE(review): a1 is used as a regex pattern here; assumes sequence
        # alphabets contain no regex metacharacters — TODO confirm.
        if re.search(a1, cons):
            cons_start, cons_end = locate_subseq(cons, a1)

            # Accept only near-end alignments: extend the consensus 3' end.
            if score >= minscore and cons_end > len(cons)-5:
                scores.append(score)
                align_end = locate_subseq(seq, a2)[1]
                cons += seq[align_end:]

    #print self.start, self.end, cons

    if scores:
        return cons, np.mean(scores)
    else:
        return cons, 0.0
def align_shortlist(seq, shortlist):
    """Align ``seq`` against every candidate in ``shortlist``.

    Returns a list of (aligned seq string, aligned candidate string) pairs,
    one per candidate, using the module-level substitution matrix ``S``.
    """
    encoded_seq = list(align.string_to_alignment(seq))

    pairs = []
    for candidate in shortlist:
        encoded_cand = list(align.string_to_alignment(candidate))
        _, aln_seq, aln_cand = align.align(encoded_seq, encoded_cand,
                                           -1, -1, S, True, True)
        pairs.append((align.alignment_to_string(aln_seq),
                      align.alignment_to_string(aln_cand)))
    return pairs
def local_check(*zipped):
    """Return False as soon as one (x, y) pair's gap-stripped local
    re-alignment differs from ``y`` by more than one position (per
    ``hamming_dist``); True when every pair passes.

    NOTE(review): ``list(*zipped)`` unpacks the varargs tuple into list(),
    so this only works when the caller passes exactly one iterable of
    (x, y) pairs — TODO confirm call sites.
    """
    for x, y in list(*zipped):
        # Strip gaps from x, then locally re-align it against y using the
        # module-level substitution matrix S.
        nx = list(align.string_to_alignment(x.replace("-", "")))
        ny = list(align.string_to_alignment(y))
        score, xr, yr = align.align(nx, ny, -1, -1, S, True, True)
        xr = align.alignment_to_string(xr)
        yr = align.alignment_to_string(yr)
        # diff = abs(len(y.replace("-", "")) - len(xr.replace("-", "")))
        # if diff > 1:
        #     return False
        if hamming_dist(y, xr) > 1:
            # print(y, xr, yr, hamming_dist(y, xr))
            return False
    return True
def align(self, seq1, seq2, local = False):
    """Align ``seq1`` against ``seq2`` using this object's gap and
    substitution parameters; return (aligned1, aligned2, score).

    For local alignments, both aligned strings are padded with '-' flanks
    so they sit in a common frame relative to the inputs.
    """
    s1 = align.string_to_alignment(seq1)
    s2 = align.string_to_alignment(seq2)
    # Delegates to the module-level `align` package (this method shadows it).
    score, a1, a2 = align.align(s1, s2, self.gap_open, self.gap_extend, self.subs, local)
    res1, res2 = align.alignment_to_string(a1), align.alignment_to_string(a2)
    if local:
        # Strip gaps to find where each aligned core starts in its input.
        strip1, strip2 = res1.replace("-", ""), res2.replace("-", "")
        # NOTE(review): .index raises ValueError if the gap-stripped local
        # alignment is not a contiguous substring of the input (i.e. the
        # alignment contained internal gaps) — TODO confirm callers only
        # produce gap-free local cores.
        start1, start2 = seq1.index(strip1), seq2.index(strip2)
        start_flank = max(start1, start2)
        end_flank = max(len(seq1) - len(strip1) - start1, len(seq2) - len(strip2) - start2)
        res1 = "-" * start_flank + res1 + "-" * end_flank
        res2 = "-" * start_flank + res2 + "-" * end_flank
    return res1, res2, score
def _build_stream_and_connection_data(msgs, limit): stream_data = defaultdict(list) conn_data = defaultdict(list) # Build streams and connections for msg in msgs: data = align.string_to_alignment(msg.data[:limit]) stream_data[msg.stream].append(data) conn_data[msg.conn].append(data) # Remove those with insufficient data tbr = [] for key in stream_data: if len(stream_data[key]) <= 1: tbr.append(key) for key in tbr: del stream_data[key] tbr = [] for key in conn_data: if len(conn_data[key]) <= 1: tbr.append(key) for key in tbr: del conn_data[key] return (stream_data, conn_data)
def _classify_cluster_fields(msgs, limit, sizes, max_num_flag_values, noise_ratio, num_iters, labels): data = [align.string_to_alignment(msg.data) for msg in msgs] # Create clusters from FD labels clusters = defaultdict(list) for (i, label) in enumerate(labels): clusters[label].append(i) est = defaultdict(lambda: defaultdict(dict)) for label in clusters: cluster_data = [data[i] for i in clusters[label]] if len(cluster_data) <= 1: continue aligned_data = _build_aligned_data(cluster_data, limit) samples = _build_samples(aligned_data) for size in sizes: est[label]['constants'][size] = constant(samples, size) est[label]['flags'][size] = flag(samples, size, max_num_flag_values) est[label]['uniforms'][size] = uniform(samples, size) est[label]['numbers'][size] = number(aligned_data, size) est[label]['incrementals'][size] = incremental(aligned_data, size) est[label]['lengths'][size] = length(cluster_data, size, noise_ratio, num_iters) return dict(est)
def align(self, seq1, seq2, local=False):
    """Align two strings with this object's gap/substitution parameters.

    Returns (aligned1, aligned2, score). For local alignments both aligned
    strings are padded with '-' flanks so they share a common frame
    relative to the original inputs.
    """
    enc1 = align.string_to_alignment(seq1)
    enc2 = align.string_to_alignment(seq2)

    score, raw1, raw2 = align.align(enc1, enc2, self.gap_open,
                                    self.gap_extend, self.subs, local)
    res1 = align.alignment_to_string(raw1)
    res2 = align.alignment_to_string(raw2)

    if local:
        # Gap-stripped cores locate each alignment within its input string.
        core1 = res1.replace("-", "")
        core2 = res2.replace("-", "")
        off1 = seq1.index(core1)
        off2 = seq2.index(core2)

        lead = max(off1, off2)
        tail = max(len(seq1) - len(core1) - off1,
                   len(seq2) - len(core2) - off2)

        res1 = "-" * lead + res1 + "-" * tail
        res2 = "-" * lead + res2 + "-" * tail

    return res1, res2, score
def consensus(seqs, minscore=0.95):
    """Build a consensus from sorted aligned reads iteratively; expects
    ``seqs`` to be sorted in reference genome order.

    Each subsequent unique sequence is locally aligned to the growing
    consensus and used to extend its 3' end when the alignment identity is
    at least ``minscore`` and the alignment reaches near the consensus end.

    Returns (consensus string, mean per-sequence alignment score).
    Empty input returns ('', 0.0).
    """
    # Substitution matrix: +1 match / -1 mismatch over all byte values.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    # Guard empty input (previously raised IndexError at seqs[0]).
    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1:  # no consensus necessary
        return seqs[0], 1.0

    # Collapse runs of identical adjacent sequences (input is sorted).
    uniq_seqs = [seqs[0]]
    for i, seq in enumerate(seqs[1:], start=1):
        if seq != seqs[i-1]:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1:  # all seqs were the same!
        return uniq_seqs[0], 1.0

    cons = uniq_seqs[0]
    scores = []

    # Downsample very large inputs. Sample sorted *indices* rather than
    # np.random.choice over the list itself: the latter shuffles the
    # sequences, violating the sorted-order precondition above.
    if len(uniq_seqs) > 1000:
        picks = sorted(np.random.choice(range(len(uniq_seqs)), size=1000))
        uniq_seqs = [uniq_seqs[u] for u in picks]

    for seq in uniq_seqs[1:]:
        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Gap-stripped aligned portion of the incoming sequence.
        a2 = ''.join([b for b in list(align.alignment_to_string(a2)) if b != '-'])

        # Per-column identity (simplifies to s / len(a1)); guard the empty
        # alignment, which previously raised ZeroDivisionError.
        score = 0.0
        if len(a1) > 0:
            score = float(len(a1) - (len(a1)-s)) / float(len(a1))
        scores.append(score)

        # NOTE(review): a1 is used as a regex pattern; assumes sequence
        # alphabets contain no regex metacharacters.
        if re.search(a1, cons):
            cons_start, cons_end = locate_subseq(cons, a1)

            # Accept only near-end alignments: extend the consensus 3' end.
            if score >= minscore and cons_end > len(cons)-5:
                align_end = locate_subseq(seq, a2)[1]
                cons += seq[align_end:]

    return cons, np.mean(scores)
def _classify_global_fields(msgs, limit, sizes, max_num_flag_values, noise_ratio, num_iters):
    """Estimate field types at global, per-connection and per-stream scope.

    Runs field-type estimators over all messages (global scope), and
    consensus-based estimators over per-connection and per-stream groupings.

    Returns {'global': ..., 'connection': ..., 'stream': ...} where each
    value maps estimator name -> {size: estimate}.
    """
    # Encode every payload; `limited_data` truncates to the first `limit`
    # positions for the positional (per-offset) analyses.
    data = [align.string_to_alignment(msg.data) for msg in msgs]
    limited_data = [d[:limit] for d in data]
    (stream_data, conn_data) = _build_stream_and_connection_data(msgs, limit)

    global_samples = _build_samples(limited_data)

    conn_samples = {}
    stream_samples = {}

    # Per-connection / per-stream samples skip the zero/flag/uniform
    # preprocessing that the global pass uses.
    for key in conn_data:
        conn_samples[key] = _build_samples(conn_data[key], zero=False, flag=False, uniform=False)
    for key in stream_data:
        stream_samples[key] = _build_samples(stream_data[key], zero=False, flag=False, uniform=False)

    global_est = defaultdict(dict)
    conn_est = defaultdict(dict)
    stream_est = defaultdict(dict)

    for size in sizes:
        global_est['constants'][size] = constant(global_samples, size)
        global_est['flags'][size] = flag(global_samples, size, max_num_flag_values)
        global_est['uniforms'][size] = uniform(global_samples, size)
        global_est['numbers'][size] = number(limited_data, size)
        # Lengths are estimated on the full (untruncated) payloads.
        global_est['lengths'][size] = length(data, size, noise_ratio, num_iters)

        conn_est['constants'][size] = _field_consensus(conn_samples, limit, size, constant)
        stream_est['constants'][size] = _field_consensus(stream_samples, limit, size, constant)
        stream_est['incrementals'][size] = _field_consensus(stream_data, limit, size, incremental)

    est = {
        'global': global_est,
        'connection': conn_est,
        'stream': stream_est,
    }
    return dict(est)
def nwalign_wrapper(seq1, seq2, matrix=NUCMATRIX):
    """Globally align two sequences and return the score normalized by
    alignment length.

    Uses gap open/extend penalty -1 and the given substitution ``matrix``.
    Returns 0.0 when the alignment is empty (both inputs empty), which
    previously raised ZeroDivisionError.
    """
    s1 = align.string_to_alignment(seq1)
    s2 = align.string_to_alignment(seq2)
    (score, a1, a2) = align.align(s1, s2, -1, -1, matrix)
    if len(a1) == 0:
        return 0.0
    return float(score) / len(a1)
def consensus(seqs, minscore=0.95):
    """Build a consensus from sorted aligned reads iteratively; expects
    ``seqs`` to be sorted in reference genome order.

    Unlike simpler variants, this version re-seeds the scaffold from the
    next unique sequence while no sequence has aligned yet (``align_init``).

    Returns (consensus string, mean per-sequence alignment score).
    """
    # Substitution matrix: +1 match / -1 mismatch over all byte values.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1:  # no consensus necessary
        return seqs[0], 1.0

    # Collapse runs of identical adjacent sequences (input is sorted).
    uniq_seqs = [seqs[0]]
    for i, seq in enumerate(seqs[1:], start=1):
        if seq != seqs[i - 1]:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1:  # all seqs were the same!
        return uniq_seqs[0], 1.0

    start_index = 0
    cons = uniq_seqs[start_index]
    scores = []
    # Becomes True once any sequence has successfully extended the scaffold.
    align_init = False

    for i, seq in enumerate(uniq_seqs[1:]):
        #print 'oldcons:', cons
        #print 'seq    :', seq
        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Gap-stripped aligned portion of the incoming sequence.
        a2 = ''.join(
            [b for b in list(align.alignment_to_string(a2)) if b != '-'])

        score = 0.0
        if len(a1) > 0:
            # Simplifies to s / len(a1): per-column identity.
            score = float(len(a1) - (len(a1) - s)) / float(len(a1))
        #print 'score  :', score
        scores.append(score)

        # NOTE(review): a1 is used as a regex pattern; assumes sequence
        # alphabets contain no regex metacharacters — TODO confirm.
        if re.search(a1, cons):
            cons_start, cons_end = locate_subseq(cons, a1)

            # Accept only near-end alignments: extend the consensus 3' end.
            if score >= minscore and cons_end > len(cons) - 5:
                align_end = locate_subseq(seq, a2)[1]
                cons += seq[align_end:]
                align_init = True
                #print 'newcons:', cons

        elif not align_init:  # haven't found a scaffold yet
            start_index += 1
            cons = uniq_seqs[start_index]

        #print '****'

    return cons, np.mean(scores)
def consensus(self, minscore=0.9):
    """Iteratively build a consensus sequence from this object's reads.

    Reads are quality-trimmed, sorted, deduplicated (adjacent duplicates),
    then each subsequent sequence is locally aligned against the growing
    consensus and appended past the alignment end when it scores at least
    ``minscore`` and the alignment reaches near the consensus 3' end.

    Returns (consensus string, mean score of accepted extensions);
    the score is 0.0 when no extension was accepted.
    """
    # +1 match / -1 mismatch substitution matrix over all byte values.
    subst = (2 * np.identity(256) - np.ones((256, 256))).astype(np.int16)

    minqual = self.reads[0].minqual

    ordered = sorted(SortableRead(sr.read) for sr in self.reads)
    seqs = [qualtrim(entry.read, minqual=minqual) for entry in ordered]
    seqs = [s for s in seqs if len(s) > 20]  # drop very short trimmed reads

    if len(seqs) == 0:
        return '', 0.0
    if len(seqs) == 1:
        # A single read is its own consensus.
        return seqs[0], 1.0

    # Collapse runs of identical adjacent sequences.
    uniq_seqs = [seqs[0]]
    for previous, current in zip(seqs, seqs[1:]):
        if current != previous:
            uniq_seqs.append(current)

    if len(uniq_seqs) == 1:
        # Every read trimmed to the same sequence.
        return uniq_seqs[0], 1.0

    cons = uniq_seqs[0]
    scores = []

    # Downsample huge clusters; sampling sorted indices preserves order.
    if len(uniq_seqs) > 1000:
        keep = sorted(np.random.choice(range(len(uniq_seqs)), size=1000))
        uniq_seqs = [uniq_seqs[k] for k in keep]

    for seq in uniq_seqs[1:]:
        enc_cons = align.string_to_alignment(cons)
        enc_seq = align.string_to_alignment(seq)
        raw_score, aln_cons, aln_seq = align.align(enc_cons, enc_seq,
                                                   -2, -2, subst, local=True)
        aln_cons = align.alignment_to_string(aln_cons)
        aln_seq = align.alignment_to_string(aln_seq).replace('-', '')

        # Per-column identity of the local alignment (0.0 when empty).
        score = 0.0
        if aln_cons:
            score = float(len(aln_cons) - (len(aln_cons) - raw_score)) \
                / float(len(aln_cons))

        if re.search(aln_cons, cons):
            cons_start, cons_end = locate_subseq(cons, aln_cons)

            # Accept only near-end alignments: extend the consensus 3' end.
            if score >= minscore and cons_end > len(cons) - 5:
                scores.append(score)
                seq_end = locate_subseq(seq, aln_seq)[1]
                cons += seq[seq_end:]

    if scores:
        return cons, np.mean(scores)
    return cons, 0.0
def consensus(seqs, minscore=0.95):
    """Build a consensus from sorted aligned reads iteratively; expects
    ``seqs`` to be sorted in reference genome order.

    Re-seeds the scaffold from the next unique sequence while no sequence
    has aligned yet (``align_init``).

    Returns (consensus string, mean per-sequence alignment score).
    """
    # Substitution matrix: +1 match / -1 mismatch over all byte values.
    S = -np.ones((256, 256)) + 2 * np.identity(256)
    S = S.astype(np.int16)

    if len(seqs) == 0:
        return '', 0.0

    if len(seqs) == 1:  # no consensus necessary
        return seqs[0], 1.0

    # Collapse runs of identical adjacent sequences (input is sorted).
    uniq_seqs = [seqs[0]]
    for i, seq in enumerate(seqs[1:], start=1):
        if seq != seqs[i-1]:
            uniq_seqs.append(seq)

    if len(uniq_seqs) == 1:  # all seqs were the same!
        return uniq_seqs[0], 1.0

    start_index = 0
    cons = uniq_seqs[start_index]
    scores = []
    # Becomes True once any sequence has successfully extended the scaffold.
    align_init = False

    for i, seq in enumerate(uniq_seqs[1:]):
        #print 'oldcons:', cons
        #print 'seq    :', seq
        s1 = align.string_to_alignment(cons)
        s2 = align.string_to_alignment(seq)
        (s, a1, a2) = align.align(s1, s2, -2, -2, S, local=True)
        a1 = align.alignment_to_string(a1)
        # Gap-stripped aligned portion of the incoming sequence.
        a2 = ''.join([b for b in list(align.alignment_to_string(a2)) if b != '-'])

        score = 0.0
        if len(a1) > 0:
            # Simplifies to s / len(a1): per-column identity.
            score = float(len(a1) - (len(a1)-s)) / float(len(a1))
        #print 'score  :', score
        scores.append(score)

        # NOTE(review): a1 is used as a regex pattern; assumes sequence
        # alphabets contain no regex metacharacters — TODO confirm.
        if re.search(a1, cons):
            cons_start, cons_end = locate_subseq(cons, a1)

            # Accept only near-end alignments: extend the consensus 3' end.
            if score >= minscore and cons_end > len(cons)-5:
                align_end = locate_subseq(seq, a2)[1]
                cons += seq[align_end:]
                align_init = True
                #print 'newcons:', cons

        elif not align_init:  # haven't found a scaffold yet
            start_index += 1
            cons = uniq_seqs[start_index]

        #print '****'

    return cons, np.mean(scores)