def check_read(segments, seq):
    """Run spoa's POA over the slices of ``seq`` described by ``segments``.

    ``segments`` is a ';'-separated string of 'start-end' items; each item
    is cut out of ``seq`` as ``seq[start:end]``. The POA result is
    discarded and nothing is returned.
    """
    from spoa import poa

    pieces = []
    for span in segments.split(';'):
        bounds = span.split('-')
        pieces.append(seq[int(bounds[0]):int(bounds[1])])

    poa(pieces, 1, True, -1, -1, -1, -1, -1)
def poa_consensus(self, additional_seq=None, method='spoa'):
    """Create a consensus sequence for the read.

    :param additional_seq: optional extra sequence, written as the first
        record (named 'additional') of the racon input. It is not used by
        the 'spoa' method.
    :param method: consensus method, either 'spoa' or 'racon'.

    :returns: the consensus sequence, which is also stored in
        ``self.consensus``.

    :raises ValueError: if ``method`` is not recognised.
    """
    import contextlib
    import os

    self.initialize()
    if method == 'spoa':
        # Bring every subread into the same orientation before alignment.
        seqs = [
            subread.seq if orient
            else medaka.common.reverse_complement(subread.seq)
            for orient, subread in zip(*self.interleaved_subreads)
        ]
        consensus_seq, _ = spoa.poa(seqs, genmsa=False)
    elif method == 'racon':
        # delete=False so the file can be reopened by name while it is
        # still open here; that makes us responsible for removing it.
        fh = tempfile.NamedTemporaryFile(
            'w', suffix='.fasta', delete=False)
        try:
            with fh:
                if additional_seq is not None:
                    fh.write(">{}\n{}\n".format(
                        'additional', additional_seq))
                for _, subread in zip(self._orient, self.subreads):
                    fh.write(">{}\n{}\n".format(subread.name, subread.seq))
                fh.flush()
                consensus_seq = self._run_racon(fh.name)
        finally:
            # Previously the temporary FASTA was left behind on every call.
            with contextlib.suppress(OSError):
                os.remove(fh.name)
    else:
        raise ValueError('Unrecognised method: {}.'.format(method))
    self.consensus = consensus_seq
    # A new consensus invalidates any alignments made to the previous one.
    self._alignments_valid = False
    self.consensus_run = True
    return consensus_seq
def cluster_sequence(hpc_freq, sequence):
    """Merge similar segments and build one consensus per merged group.

    Each entry of ``hpc_freq`` is indexed as ``entry[0]`` (the string that
    is compared by edit distance) and ``entry[1]`` (indices into
    ``sequence``; presumably the reads supporting that string).

    Returns ``hpc_freq`` unchanged when it holds a single entry. Otherwise
    returns a list with one item per cluster: the original entry for a
    singleton cluster, or ``(consensus, read_indices)`` for a merged one.
    """
    from scipy.cluster.hierarchy import linkage, leaves_list
    from scipy.spatial.distance import squareform
    from Levenshtein import distance
    from spoa import poa

    n_segments = len(hpc_freq)
    if n_segments == 1:
        return hpc_freq

    # Length-normalised edit distance: fill the upper triangle (diagonal
    # included), then mirror it to get a symmetric matrix.
    upper = np.zeros((n_segments, n_segments))
    for row in range(n_segments):
        left = hpc_freq[row][0]
        for col in range(row, n_segments):
            right = hpc_freq[col][0]
            upper[row][col] = distance(left, right) / max(
                len(left), len(right))
    dist = upper + upper.T

    # Leaf order from hierarchical clustering; when every distance is zero
    # the input order is kept.
    if dist.sum() != 0:
        order = leaves_list(
            linkage(squareform(dist), "ward", optimal_ordering=True))
    else:
        order = list(range(n_segments))

    # Walk neighbouring leaves and start a new cluster whenever the
    # distance between two neighbours reaches 0.3.
    clusters = [[order[0]]]
    for prev, cur in pairwise(order):
        lo, hi = (cur, prev) if prev > cur else (prev, cur)
        if dist[lo][hi] < 0.3:
            clusters[-1].append(cur)
        else:
            clusters.append([cur])

    ccs_seq = []
    for members in clusters:
        if len(members) == 1:
            ccs_seq.append(hpc_freq[members[0]])
            continue

        reads = flatten([hpc_freq[idx][1] for idx in members])
        member_seqs = [sequence[idx] for idx in reads]

        # Consensus of all reads belonging to the merged cluster
        ccs, _ = poa(member_seqs, 2, False, 10, -4, -8, -2, -24, -1)
        ccs_seq.append((ccs, reads))

    return ccs_seq
def test_poa():
    """Print the consensus and MSA that ``poa`` produces for six fixed segments."""
    # (coordinate label, segment sequence) pairs; only the sequences are aligned.
    segments = (
        ('0-145', 'TCCCGGTCATCATAACCCCGATCGTACCCTCTGTCATAATAGTCTCGGCGGCGAGAACTGCCACTGTAAATCTGATCCCTGTCTTGAGCTGCTCTCCATCCACCTCCCTCCACCACCTCCTCCTCTGTATGATCTGCTGTAATAG'),
        ('145-289', 'TCCCGGTCATCATAACCCCGATCATTGCCACCTGTCATAGTCTCGGCGGCGAGAACTGCCACTGTAAATCCCCTGATCCCTGTCTTGAGCTGCTCTCCATCCCCTCCTCCACCACCTCCTCCTCTGTATGATCTGCTGTAATAG'),
        ('289-433', 'TCCCGGTCATCATAACCCCGATCGTACCCTCTGTCATAATGGTCTCGGCGGCGAGAACTGCCACTGTAAATCTGATCCCTGTCTTGAGCTGCTCTCCATCCACCTCCTCCACCACCTCCTCCTCTGTATGATCTGCTGTAATAG'),
        ('433-579', 'TCCCGGTCATCATAACCCCGATCGTACTCTGTCATAATAGTCTCGGCGGCGAGAGGCGCCACTGTAAATCTGATCCCTGTCTTGAGCTGCTCTCCATCCACCTCCTCCACCACCTCCTCCCCTCTGTATGATCTGCTGTAATAG'),
        ('579-721', 'TCCCGGTCATCATAACCCCGATCGTACCCATAATAGTCTCGGCGAGAACTGCCACTGTAAATCCTGATCCCTGTCTTGAGCTGCTCTCCATCCACCTCCTCCACCACCTCCTCCTCTGTATGATCTGCTGTAATAG'),
        ('721-742', 'TCCGGTCATCATAACCCCGATCCATAATAGTCTCGGCG'),
    )
    reads = [bases for _, bases in segments]
    ccs, msa = poa(reads, 0, True, 10, -4, -8, -2, -24, -1)
    print(ccs, msa)
def find_consensus(header, seq):
    """Build a consensus from the repeated segments found in one read.

    Returns ``(segments, ccs, is_circular)`` where ``segments`` is a
    ';'-joined string of 'start-end' items, ``ccs`` is the POA consensus of
    those slices of ``seq`` and ``is_circular`` is 0 or 1. Returns
    ``(None, None, None)`` when the read is 50 bases or shorter, when fewer
    than two segments are found, or when the segments differ too much from
    the consensus.
    """
    from spoa import poa
    from Levenshtein import distance

    rejected = (None, None, None)

    # Reads this short are not searched at all
    if len(seq) <= 50:
        return rejected

    # Try each search setting in turn. The first chain flagged circular
    # wins outright; otherwise the first chain found is kept as fallback.
    chains = None
    is_circular = 0
    for kmer, hpc in ((11, False), (8, False), (11, True), (8, True)):
        candidate, circ_flag = circular_finder(header, seq, k=kmer,
                                               use_hpc=hpc)
        if candidate is None:
            continue
        if circ_flag == 1:
            chains = candidate
            is_circular = 1
            break
        if chains is None:
            chains = candidate

    if chains is None or len(chains) < 2:
        return rejected

    fasta = [seq[start:end] for start, end in chains]
    ccs, _ = poa(fasta, 2, False, 10, -4, -8, -2, -24, -1)

    # Segment similarity: the last segment is only compared against a
    # prefix of the consensus of its own length.
    tail = fasta[-1]
    tail_len = len(tail)
    if len(fasta) == 2:
        dis_body = distance(fasta[0][:tail_len], ccs[:tail_len]) / tail_len
    else:
        dis_body = max(distance(part, ccs) / len(ccs) for part in fasta[:-1])
    dis_tail = distance(tail, ccs[:tail_len]) / tail_len

    if dis_body > 0.2 or dis_tail > 0.35:
        return rejected

    segments = ';'.join('{}-{}'.format(start, end) for start, end in chains)
    return segments, ccs, is_circular
def correct_cluster(cluster, is_debug=False, max_cluster=200):
    """Refine the junction of a read cluster and curate its isoforms.

    :param cluster: sequence of read records; the attributes read here are
        ``type``, ``circ_id`` (formatted 'contig:start-end'), ``seq`` and
        ``read_id``.
    :param is_debug: when true, return only the ``CIRC`` object.
    :param max_cluster: maximum number of 'full' reads used for the
        sequence refinement step; larger clusters are randomly sampled
        down to this size.

    :returns: ``None`` when ``cluster`` is None, has at most one member,
        contains no read of type 'full', or when ``curate_cirexons``,
        ``curate_isoform`` or ``check_isoforms`` reject the result.
        Otherwise the ``CIRC`` object if ``is_debug`` is set, else
        ``(circ_type, (read_ids, isoform_reads, cluster_seq, circ_id,
        strand, ss_id, us_free, ds_free, circ_len, isoforms))``.
    """
    from random import sample
    from collections import Counter
    from spoa import poa
    from libs.striped_smith_waterman.ssw_wrap import Aligner

    if cluster is None:
        return None

    if len(cluster) <= 1:
        return None

    if 'full' not in set([i.type for i in cluster]):
        return None

    # Reference read: the longest 'full' read carrying the most common
    # circ_id among the 'full' reads.
    counter = Counter([i.circ_id for i in cluster
                       if i.type == 'full']).most_common(n=1)
    ref = sorted([
        i for i in cluster if i.circ_id == counter[0][0] and i.type == 'full'
    ], key=lambda x: len(x.seq), reverse=True)[0]
    ssw = Aligner(ref.seq[:50], match=10, mismatch=4, gap_open=8, gap_extend=2)

    # Align every member except cluster[0] against the first 50 bases of
    # the reference read.
    head_pos = []
    for query in cluster[1:]:
        alignment = ssw.align(query.seq)
        head_pos.append(alignment.ref_begin)

    # presumably transform_seq rotates the sequence by the given offset
    # -- TODO confirm against its definition
    template = transform_seq(ref.seq, max(head_pos))
    ssw = Aligner(template, match=10, mismatch=4, gap_open=8, gap_extend=2)
    junc_seqs = [
        get_junc_seq(template, -max(head_pos) // 2, 25),
    ]

    for query in cluster[1:]:
        alignment = ssw.align(query.seq)
        tmp = transform_seq(query.seq, alignment.query_begin)
        junc_seqs.append(get_junc_seq(tmp, -max(head_pos) // 2, 25))

    # Consensus of the junction sequences collected above
    cs_junc, _ = poa(junc_seqs, 2, False, 10, -4, -8, -2, -24, -1)

    # Most common contig, plus the start/end coordinates parsed from every
    # member's circ_id
    ctg = Counter([i.circ_id.split(':')[0]
                   for i in cluster]).most_common()[0][0]
    tmp_st = [int(i.circ_id.split(':')[1].split('-')[0]) for i in cluster]
    tmp_en = [int(i.circ_id.split(':')[1].split('-')[1]) for i in cluster]

    # Curate junction sequence
    scores = curate_junction(ctg, tmp_st, tmp_en, cs_junc)
    aval_junc = min_sorted_items(scores, 2)
    if aval_junc:
        # Prefer candidates returned by annotated_hit; within either group
        # take the one with the highest junc_score.
        anno_junc = annotated_hit(ctg, aval_junc)
        if anno_junc:
            anno_junc = sorted(anno_junc,
                               key=lambda x: junc_score(ctg, x, junc_seqs),
                               reverse=True)
            # NOTE(review): circ_score is assigned here and below but never
            # read in this function.
            circ_start, circ_end, circ_score = anno_junc[0]
        else:
            aval_junc = sorted(aval_junc,
                               key=lambda x: junc_score(ctg, x, junc_seqs),
                               reverse=True)
            circ_start, circ_end, circ_score = aval_junc[0]
    else:
        # No candidate junction: fall back to the coordinates of the most
        # common circ_id.
        circ_start, circ_end = counter[0][0].split(':')[1].split('-')
        circ_start, circ_end = int(circ_start), int(circ_end)

    # Annotated sites
    for shift_threshold in [5, 10]:
        ss_site, us_free, ds_free, tmp_signal = find_annotated_signal(
            ctg, circ_start, circ_end, 0, 10, shift_threshold)
        if ss_site is not None:
            ss_id, strand, us_shift, ds_shift = ss_site
            circ_start += us_shift
            circ_end += ds_shift
            circ_type = 'Annotated'
            break

    host_strand = find_host_gene(ctg, circ_start, circ_end)
    # NOTE(review): this reset discards the 'Annotated' value set in the
    # loop above, so a junction matched there ends up with circ_type None
    # -- confirm this is intended.
    circ_type = None

    # Canonical sites
    if ss_site is None:
        for shift_threshold in [5, 10]:
            ss_site = find_denovo_signal(ctg, circ_start, circ_end,
                                         host_strand, tmp_signal, us_free,
                                         ds_free, 0, 10, shift_threshold,
                                         True)
            if ss_site is not None:
                ss_id, strand, us_shift, ds_shift = ss_site
                circ_start += us_shift
                circ_end += ds_shift
                # NOTE(review): same label as the annotated search above
                # -- confirm it is not meant to be a distinct type.
                circ_type = 'Annotated'
                break

    # Intronic circRNAs
    if ss_site is None:
        retained_introns = find_retained_introns(ctg, circ_start + 1,
                                                 circ_end)
        overlap_exons = find_overlap_exons(ctg, circ_start + 1, circ_end)

        is_lariat = 0
        if retained_introns is not None and overlap_exons is None:
            is_lariat = 1
            # Find high-confidence ciRNAs
            # Flatten the dict values into one set of
            # (start, end, strand) introns.
            retained_introns = set(
                sum([i for _, i in retained_introns.items()], []))
            retained_strand = set([i[2] for i in retained_introns])
            tmp_circ = []
            for intron_start, intron_end, intron_strand in retained_introns:
                # Skip introns whose boundaries are more than 50 bases away
                # from the current junction.
                if abs(intron_start - circ_start) > 50 or abs(
                        intron_end - circ_end) > 50:
                    continue
                if intron_strand == '+':
                    tmp_site = [i for i in scores if i[0] == intron_start]
                else:
                    tmp_site = [i for i in scores if i[1] == intron_end]
                if tmp_site:
                    tmp_circ.append([*tmp_site[0], intron_strand])

            ss_id = 'lariat'
            if tmp_circ:
                # Candidate with the smallest score (third element)
                circ_start, circ_end, circ_score, strand = sorted(
                    tmp_circ, key=lambda x: x[2])[0]
                circ_type = 'High confidence lariat'
            else:
                # Lariat with recursive splicing branchpoint
                is_lariat = 0
                tmp_circ = []
                for tmp_strand in retained_strand:
                    tmp_start, tmp_end, tmp_score = recursive_splice_site(
                        scores, ctg, tmp_strand)
                    if tmp_score is not None:
                        tmp_circ.append(
                            [tmp_start, tmp_end, tmp_score, tmp_strand])
                if tmp_circ:
                    circ_start, circ_end, circ_score, strand = sorted(
                        tmp_circ, key=lambda x: x[2])[0]
                    # cnt['Recursive splicing lariat'] += 1
                else:
                    # cnt['Unknown lariat'] += 1
                    strand = 'None'

        # Find denovo splice signal
        # Runs for every case except a high-confidence lariat; ss_id,
        # strand and circ_type are overwritten on both outcomes.
        if is_lariat == 0:
            ss_site = find_denovo_signal(ctg, circ_start, circ_end,
                                         host_strand, tmp_signal, us_free,
                                         ds_free, 5, 10, 3, False)
            if ss_site is not None:
                ss_id, strand, us_shift, ds_shift = ss_site
                circ_start += us_shift
                circ_end += ds_shift
                circ_type = 'Denovo signal'
            else:
                ss_id = 'None'
                strand = 'None'
                circ_type = 'Unknown signal'

    circ_id = '{}:{}-{}'.format(ctg, circ_start + 1, circ_end)

    # refined sequence
    cluster_seq = []
    circ_junc_seq = genome_junction_seq(ctg, circ_start, circ_end)
    ssw = Aligner(circ_junc_seq, match=10, mismatch=4, gap_open=8,
                  gap_extend=2, report_cigar=True)

    # Only 'full' reads are refined: at most max_cluster of them, longest
    # first.
    tmp_cluster = [i for i in cluster if i.type == 'full']
    if len(tmp_cluster) > max_cluster:
        tmp_cluster = sample(tmp_cluster, max_cluster)
    tmp_cluster = sorted(tmp_cluster, key=lambda x: len(x.seq), reverse=True)
    for query in tmp_cluster:
        # The read is doubled before alignment; presumably so a junction
        # spanning the read's ends can still be aligned -- TODO confirm
        alignment = ssw.align(query.seq * 2)
        tmp_pos = find_alignment_pos(alignment, len(circ_junc_seq) // 2)
        if tmp_pos is None:
            cluster_seq.append((query.read_id, query.seq))
        else:
            tmp_seq = transform_seq(query.seq, tmp_pos % len(query.seq))
            cluster_seq.append((query.read_id, tmp_seq))

    cluster_res = batch_cluster_sequence(circ_id, cluster_seq)
    cluster_res = sorted(cluster_res, key=lambda x: len(x[1]), reverse=True)

    circ = CIRC(ctg, circ_start + 1, circ_end, strand)
    circ_id = '{}:{}-{}'.format(circ.contig, circ.start, circ.end)
    # With more than two sub-clusters and a sufficiently large biggest one,
    # re-run the correction on the reads of that sub-cluster only (debug
    # mode, so a CIRC object or None comes back) and adopt its junction.
    if len(cluster_res) > 2 and len(
            cluster_res[0][1]) >= 0.5 * max(len(tmp_cluster), 10):
        tmp_res = correct_cluster(
            [i for i in cluster if i.read_id in cluster_res[0][1]], True)
        if tmp_res is not None:
            circ = tmp_res
            circ_id = '{}:{}-{}'.format(circ.contig, circ.start, circ.end)

    # Filter out strange cirexons
    curated_exons = curate_cirexons(circ, cluster)
    if curated_exons is None:
        return None

    isoforms, isoform_reads, circ_len = curate_isoform(circ, curated_exons,
                                                       cluster_res)
    if isoforms is None:
        return None

    is_concordance = check_isoforms(circ, isoforms)
    if not is_concordance:
        return None

    if is_debug:
        return circ

    return circ_type, ([i.read_id for i in cluster
                        ], isoform_reads, cluster_seq, circ_id, circ.strand,
                       ss_id, us_free, ds_free, circ_len, isoforms)
def poa(seqs, allseq=False):
    """Return the spoa consensus of ``seqs`` as a tuple.

    The tuple is ``(consensus,)``; when ``allseq`` is true the input
    sequences follow the consensus in their original order.
    """
    consensus, _ = spoa.poa(seqs, genmsa=False)
    return (consensus, *seqs) if allseq else (consensus,)
def test_bindings(self):
    """Smoke-test the bindings: default poa call gives a consensus and one MSA row per read."""
    reads = ['AACTTATA', 'AACTTATG', 'AACTATA']
    result = poa(reads)
    consensus, msa = result
    self.assertEqual(consensus, 'AACTTATA')
    self.assertEqual(len(msa), 3)
def test_bindings_no_msa(self):
    """Smoke-test the bindings with MSA generation disabled: the MSA comes back empty."""
    reads = ['AACTTATA', 'AACTTATG', 'AACTATA']
    result = poa(reads, genmsa=False)
    consensus, msa = result
    self.assertEqual(consensus, 'AACTTATA')
    self.assertEqual(len(msa), 0)