Beispiel #1
0
def check_read(segments, seq):
    """Run a partial-order alignment over the listed slices of ``seq``.

    ``segments`` is a ';'-separated string of ``start-end`` pairs.  Each pair
    is converted to integers and used to slice ``seq``; the slices are handed
    to ``spoa.poa``.  The alignment result is discarded, so the function
    returns ``None``.
    """
    from spoa import poa

    pieces = []
    for span in segments.split(';'):
        bounds = span.split('-')
        pieces.append(seq[int(bounds[0]):int(bounds[1])])
    poa(pieces, 1, True, -1, -1, -1, -1, -1)
Beispiel #2
0
 def poa_consensus(self, additional_seq=None, method='spoa'):
     """Build a consensus sequence from the subreads and store it on the read.

     With ``method='spoa'`` the interleaved subreads are collected (subreads
     whose orientation flag is falsy are reverse-complemented first) and
     passed to ``spoa.poa``; ``additional_seq`` is not used on this path.
     With ``method='racon'`` the subreads, preceded by ``additional_seq``
     when it is given, are written to a temporary FASTA file whose name is
     handed to ``self._run_racon``.

     :param additional_seq: optional extra sequence written as the first
         record of the racon input.
     :param method: ``'spoa'`` or ``'racon'``.
     :returns: the consensus sequence, which is also stored in
         ``self.consensus``.
     :raises ValueError: if ``method`` is neither ``'spoa'`` nor ``'racon'``.
     """
     self.initialize()
     if method == 'spoa':
         seqs = [
             subread.seq if orient
             else medaka.common.reverse_complement(subread.seq)
             for orient, subread in zip(*self.interleaved_subreads)
         ]
         consensus_seq, _ = spoa.poa(seqs, genmsa=False)
     elif method == 'racon':
         record = ">{}\n{}\n"
         # NOTE: delete=False means the file is not removed when the
         # ``with`` block closes it.
         with tempfile.NamedTemporaryFile(
                 'w', suffix='.fasta', delete=False) as fasta_fh:
             if additional_seq is not None:
                 fasta_fh.write(record.format('additional', additional_seq))
             # The orientation is not written; zipping still limits the
             # output to the shorter of the two collections.
             for _, subread in zip(self._orient, self.subreads):
                 fasta_fh.write(record.format(subread.name, subread.seq))
             fasta_fh.flush()
             consensus_seq = self._run_racon(fasta_fh.name)
     else:
         raise ValueError('Unrecognised method: {}.'.format(method))

     # Record the new consensus and flag existing alignments as stale.
     self.consensus = consensus_seq
     self._alignments_valid = False
     self.consensus_run = True
     return consensus_seq
Beispiel #3
0
def cluster_sequence(hpc_freq, sequence):
    """Merge similar candidate sequences and build one consensus per group.

    Each entry of ``hpc_freq`` is indexed as ``(seq, read_ids)``: ``seq`` is
    compared with the other entries by normalised Levenshtein distance, and
    ``read_ids`` are used as indices into ``sequence``.  Entries are ordered
    by Ward hierarchical clustering and neighbouring entries in that order
    are merged while their distance stays below 0.3.

    :param hpc_freq: sequence of ``(seq, read_ids)`` entries.
    :param sequence: container indexed by the read ids found in ``hpc_freq``.
    :returns: ``hpc_freq`` itself when it holds zero or one entry; otherwise
        a list with one item per group -- the untouched entry for a
        singleton group, or ``(consensus, read_ids)`` for a merged group.
    """
    # Zero or one candidate: nothing to compare.  An empty input previously
    # fell through and failed on ``z[0]``.  The check runs before the
    # third-party imports so trivial input returns immediately.
    if len(hpc_freq) <= 1:
        return hpc_freq

    from scipy.cluster.hierarchy import linkage, leaves_list
    from scipy.spatial.distance import squareform
    from Levenshtein import distance
    from spoa import poa

    # Neighbours closer than this (normalised edit distance) are merged.
    max_merge_dist = 0.3
    n_seqs = len(hpc_freq)

    # Fill the upper triangle only (the diagonal stays zero), then mirror it.
    dist = np.zeros((n_seqs, n_seqs))
    for i in range(n_seqs):
        seq_i = hpc_freq[i][0]
        for j in range(i + 1, n_seqs):
            seq_j = hpc_freq[j][0]
            dist[i][j] = distance(seq_i, seq_j) / max(len(seq_i), len(seq_j))
    dist = dist + dist.T

    # Hierarchical clustering gives the leaf order; when every distance is
    # zero the original order is used instead.
    if dist.sum() != 0:
        order = leaves_list(
            linkage(squareform(dist), "ward", optimal_ordering=True))
    else:
        order = list(range(n_seqs))

    # Walk the leaf order and start a new group whenever two neighbours are
    # too far apart.  ``dist`` is symmetric, so one lookup covers both
    # index orders.
    clusters = [[order[0]]]
    for prev, cur in pairwise(order):
        if dist[prev][cur] < max_merge_dist:
            clusters[-1].append(cur)
        else:
            clusters.append([cur])

    ccs_seq = []
    for cluster in clusters:
        if len(cluster) == 1:
            ccs_seq.append(hpc_freq[cluster[0]])
            continue

        cluster_reads = flatten([hpc_freq[i][1] for i in cluster])
        cluster_seq = [sequence[i] for i in cluster_reads]

        # Generate consensus sequence
        ccs, _ = poa(cluster_seq, 2, False, 10, -4, -8, -2, -24, -1)
        ccs_seq.append((ccs, cluster_reads))
    return ccs_seq
Beispiel #4
0
def test_poa():
    """Align six fixed read segments with ``poa`` and print the outcome.

    Each fixture pairs a ``start-end`` label with its sequence; only the
    sequences are passed to ``poa``.  The two returned values are printed.
    """
    labelled_reads = [
        ('0-145', 'TCCCGGTCATCATAACCCCGATCGTACCCTCTGTCATAATAGTCTCGGCGGCGAGAACTGCCACTGTAAATCTGATCCCTGTCTTGAGCTGCTCTCCATCCACCTCCCTCCACCACCTCCTCCTCTGTATGATCTGCTGTAATAG'),
        ('145-289', 'TCCCGGTCATCATAACCCCGATCATTGCCACCTGTCATAGTCTCGGCGGCGAGAACTGCCACTGTAAATCCCCTGATCCCTGTCTTGAGCTGCTCTCCATCCCCTCCTCCACCACCTCCTCCTCTGTATGATCTGCTGTAATAG'),
        ('289-433', 'TCCCGGTCATCATAACCCCGATCGTACCCTCTGTCATAATGGTCTCGGCGGCGAGAACTGCCACTGTAAATCTGATCCCTGTCTTGAGCTGCTCTCCATCCACCTCCTCCACCACCTCCTCCTCTGTATGATCTGCTGTAATAG'),
        ('433-579', 'TCCCGGTCATCATAACCCCGATCGTACTCTGTCATAATAGTCTCGGCGGCGAGAGGCGCCACTGTAAATCTGATCCCTGTCTTGAGCTGCTCTCCATCCACCTCCTCCACCACCTCCTCCCCTCTGTATGATCTGCTGTAATAG'),
        ('579-721', 'TCCCGGTCATCATAACCCCGATCGTACCCATAATAGTCTCGGCGAGAACTGCCACTGTAAATCCTGATCCCTGTCTTGAGCTGCTCTCCATCCACCTCCTCCACCACCTCCTCCTCTGTATGATCTGCTGTAATAG'),
        ('721-742', 'TCCGGTCATCATAACCCCGATCCATAATAGTCTCGGCG'),
    ]
    reads = [read for _label, read in labelled_reads]
    consensus, alignment = poa(reads, 0, True, 10, -4, -8, -2, -24, -1)
    print(consensus, alignment)
Beispiel #5
0
def find_consensus(header, seq):
    """Find repeated segments in a read and build their consensus.

    ``circular_finder`` is tried with four ``(k, use_hpc)`` settings in
    order.  The first result flagged circular wins; otherwise the first
    non-``None`` result is kept.  The chosen segments are aligned with
    ``spoa.poa`` and the consensus is accepted only if every segment is
    close enough to it.

    :param header: read identifier, forwarded to ``circular_finder``.
    :param seq: read sequence.
    :returns: ``(segments, ccs, is_circular)`` where ``segments`` is a
        ';'-joined string of ``start-end`` pairs, or ``(None, None, None)``
        when the read is 50 bases or shorter, fewer than two segments are
        found, or the similarity check fails.
    """
    from spoa import poa
    from Levenshtein import distance

    rejected = (None, None, None)

    # Reads of 50 bases or fewer are not processed.
    if len(seq) <= 50:
        return rejected

    # Repeat segments
    chains = None
    is_circular = 0
    for k, is_hpc in [(11, False), (8, False), (11, True), (8, True)]:
        candidate, circular_flag = circular_finder(header,
                                                   seq,
                                                   k=k,
                                                   use_hpc=is_hpc)
        if candidate is None:
            continue
        if circular_flag == 1:
            chains, is_circular = candidate, 1
            break
        if chains is None:
            # Keep the first non-circular hit as a fallback.
            chains = candidate

    # At least two segments are required to build a consensus.
    if chains is None or len(chains) < 2:
        return rejected

    fasta = [seq[start:end] for start, end in chains]
    ccs, _ = poa(fasta, 2, False, 10, -4, -8, -2, -24, -1)

    # Check segment similarity: the last segment is compared against the
    # consensus prefix of the same length only.
    tail = fasta[-1]
    tail_len = len(tail)
    ccs_head = ccs[:tail_len]
    if len(fasta) == 2:
        dis_body = distance(fasta[0][:tail_len], ccs_head) / tail_len
    else:
        dis_body = max(distance(part, ccs) / len(ccs) for part in fasta[:-1])
    dis_tail = distance(tail, ccs_head) / tail_len

    if dis_body > 0.2 or dis_tail > 0.35:
        return rejected

    segments = ';'.join('{}-{}'.format(start, end) for start, end in chains)
    return segments, ccs, is_circular
Beispiel #6
0
def correct_cluster(cluster, is_debug=False, max_cluster=200):
    """Refine the junction coordinates and isoforms of one cluster of reads.

    The function builds a consensus of the junction region across the
    cluster, picks junction coordinates from it, shifts them onto annotated,
    canonical, intronic (lariat) or de novo splice signals, re-orients the
    full-length reads on the resulting junction and finally curates exons
    and isoforms.

    :param cluster: sequence of read objects; the code reads their ``type``,
        ``circ_id`` (formatted ``contig:start-end``), ``seq`` and
        ``read_id`` attributes.
    :param is_debug: when true, return the ``CIRC`` object instead of the
        full result tuple (used by the recursive call on a dominant
        sub-cluster).
    :param max_cluster: maximum number of full-length reads used for the
        sequence refinement; larger clusters are randomly sub-sampled.
    :returns: ``None`` when the cluster is ``None``, has at most one read,
        contains no ``'full'`` read, or fails exon/isoform curation; the
        ``CIRC`` object when ``is_debug`` is true; otherwise
        ``(circ_type, (read_ids, isoform_reads, cluster_seq, circ_id,
        strand, ss_id, us_free, ds_free, circ_len, isoforms))``.
    """
    from random import sample
    from collections import Counter
    from spoa import poa
    from libs.striped_smith_waterman.ssw_wrap import Aligner

    if cluster is None:
        return None
    if len(cluster) <= 1:
        return None
    if 'full' not in set([i.type for i in cluster]):
        return None

    # Reference read: the longest 'full' read carrying the most common
    # circ_id among the 'full' reads.
    counter = Counter([i.circ_id for i in cluster
                       if i.type == 'full']).most_common(n=1)
    ref = sorted([
        i for i in cluster if i.circ_id == counter[0][0] and i.type == 'full'
    ],
                 key=lambda x: len(x.seq),
                 reverse=True)[0]
    ssw = Aligner(ref.seq[:50], match=10, mismatch=4, gap_open=8, gap_extend=2)

    # Align every read except the first one against the first 50 bases of
    # the reference read.
    head_pos = []
    for query in cluster[1:]:
        alignment = ssw.align(query.seq)
        head_pos.append(alignment.ref_begin)

    template = transform_seq(ref.seq, max(head_pos))
    ssw = Aligner(template, match=10, mismatch=4, gap_open=8, gap_extend=2)
    junc_seqs = [
        get_junc_seq(template, -max(head_pos) // 2, 25),
    ]

    # Bring each read into the template's frame and extract its junction
    # region.
    for query in cluster[1:]:
        alignment = ssw.align(query.seq)
        tmp = transform_seq(query.seq, alignment.query_begin)
        junc_seqs.append(get_junc_seq(tmp, -max(head_pos) // 2, 25))

    cs_junc, _ = poa(junc_seqs, 2, False, 10, -4, -8, -2, -24, -1)

    # Most common contig plus every start/end parsed from the circ_ids.
    ctg = Counter([i.circ_id.split(':')[0]
                   for i in cluster]).most_common()[0][0]
    tmp_st = [int(i.circ_id.split(':')[1].split('-')[0]) for i in cluster]
    tmp_en = [int(i.circ_id.split(':')[1].split('-')[1]) for i in cluster]

    # Curate junction sequence
    scores = curate_junction(ctg, tmp_st, tmp_en, cs_junc)
    aval_junc = min_sorted_items(scores, 2)
    if aval_junc:
        anno_junc = annotated_hit(ctg, aval_junc)
        if anno_junc:
            anno_junc = sorted(anno_junc,
                               key=lambda x: junc_score(ctg, x, junc_seqs),
                               reverse=True)
            circ_start, circ_end, circ_score = anno_junc[0]
        else:
            aval_junc = sorted(aval_junc,
                               key=lambda x: junc_score(ctg, x, junc_seqs),
                               reverse=True)
            circ_start, circ_end, circ_score = aval_junc[0]
    else:
        # No curated candidate: fall back to the most common circ_id.
        circ_start, circ_end = counter[0][0].split(':')[1].split('-')
        circ_start, circ_end = int(circ_start), int(circ_end)

    # Initialised before the annotated-site search so that a successful
    # search keeps its 'Annotated' label (the reset used to come after the
    # loop and always discarded it).
    circ_type = None

    # Annotated sites
    for shift_threshold in [5, 10]:
        ss_site, us_free, ds_free, tmp_signal = find_annotated_signal(
            ctg, circ_start, circ_end, 0, 10, shift_threshold)
        if ss_site is not None:
            ss_id, strand, us_shift, ds_shift = ss_site
            circ_start += us_shift
            circ_end += ds_shift
            circ_type = 'Annotated'
            break

    host_strand = find_host_gene(ctg, circ_start, circ_end)

    # Canonical sites
    if ss_site is None:
        for shift_threshold in [5, 10]:
            ss_site = find_denovo_signal(ctg, circ_start, circ_end,
                                         host_strand, tmp_signal, us_free,
                                         ds_free, 0, 10, shift_threshold, True)
            if ss_site is not None:
                ss_id, strand, us_shift, ds_shift = ss_site
                circ_start += us_shift
                circ_end += ds_shift
                # NOTE(review): this branch also labels the hit 'Annotated';
                # confirm whether a distinct label was intended.
                circ_type = 'Annotated'
                break

    # Intronic circRNAs
    if ss_site is None:
        retained_introns = find_retained_introns(ctg, circ_start + 1, circ_end)
        overlap_exons = find_overlap_exons(ctg, circ_start + 1, circ_end)

        is_lariat = 0
        if retained_introns is not None and overlap_exons is None:
            is_lariat = 1
            # Find high-confidence ciRNAs
            retained_introns = set(
                sum([i for _, i in retained_introns.items()], []))
            retained_strand = set([i[2] for i in retained_introns])
            tmp_circ = []
            for intron_start, intron_end, intron_strand in retained_introns:
                # Only introns whose both ends lie within 50 bases of the
                # junction are considered.
                if abs(intron_start - circ_start) > 50 or abs(intron_end -
                                                              circ_end) > 50:
                    continue
                if intron_strand == '+':
                    tmp_site = [i for i in scores if i[0] == intron_start]
                else:
                    tmp_site = [i for i in scores if i[1] == intron_end]
                if tmp_site:
                    tmp_circ.append([*tmp_site[0], intron_strand])

            ss_id = 'lariat'
            if tmp_circ:
                circ_start, circ_end, circ_score, strand = sorted(
                    tmp_circ, key=lambda x: x[2])[0]
                circ_type = 'High confidence lariat'
            else:
                # Lariat with recursive splicing branchpoint
                is_lariat = 0
                tmp_circ = []
                for tmp_strand in retained_strand:
                    tmp_start, tmp_end, tmp_score = recursive_splice_site(
                        scores, ctg, tmp_strand)
                    if tmp_score is not None:
                        tmp_circ.append(
                            [tmp_start, tmp_end, tmp_score, tmp_strand])
                if tmp_circ:
                    circ_start, circ_end, circ_score, strand = sorted(
                        tmp_circ, key=lambda x: x[2])[0]
                    # cnt['Recursive splicing lariat'] += 1
                else:
                    # cnt['Unknown lariat'] += 1
                    strand = 'None'

        # Find denovo splice signal
        if is_lariat == 0:
            ss_site = find_denovo_signal(ctg, circ_start, circ_end,
                                         host_strand, tmp_signal, us_free,
                                         ds_free, 5, 10, 3, False)
            if ss_site is not None:
                ss_id, strand, us_shift, ds_shift = ss_site
                circ_start += us_shift
                circ_end += ds_shift
                circ_type = 'Denovo signal'
            else:
                ss_id = 'None'
                strand = 'None'
                circ_type = 'Unknown signal'

    circ_id = '{}:{}-{}'.format(ctg, circ_start + 1, circ_end)

    # refined sequence
    cluster_seq = []
    circ_junc_seq = genome_junction_seq(ctg, circ_start, circ_end)
    ssw = Aligner(circ_junc_seq,
                  match=10,
                  mismatch=4,
                  gap_open=8,
                  gap_extend=2,
                  report_cigar=True)

    # Only 'full' reads are used here; large clusters are randomly
    # sub-sampled down to ``max_cluster`` reads.
    tmp_cluster = [i for i in cluster if i.type == 'full']
    if len(tmp_cluster) > max_cluster:
        tmp_cluster = sample(tmp_cluster, max_cluster)
    tmp_cluster = sorted(tmp_cluster, key=lambda x: len(x.seq), reverse=True)

    for query in tmp_cluster:
        # The read is doubled before alignment against the junction
        # sequence.
        alignment = ssw.align(query.seq * 2)
        tmp_pos = find_alignment_pos(alignment, len(circ_junc_seq) // 2)
        if tmp_pos is None:
            cluster_seq.append((query.read_id, query.seq))
        else:
            tmp_seq = transform_seq(query.seq, tmp_pos % len(query.seq))
            cluster_seq.append((query.read_id, tmp_seq))

    cluster_res = batch_cluster_sequence(circ_id, cluster_seq)
    cluster_res = sorted(cluster_res, key=lambda x: len(x[1]), reverse=True)

    circ = CIRC(ctg, circ_start + 1, circ_end, strand)
    circ_id = '{}:{}-{}'.format(circ.contig, circ.start, circ.end)

    # With more than two sub-clusters and a sufficiently large top one,
    # re-run the correction on the top sub-cluster's reads only and adopt
    # its result.
    if len(cluster_res) > 2 and len(
            cluster_res[0][1]) >= 0.5 * max(len(tmp_cluster), 10):
        tmp_res = correct_cluster(
            [i for i in cluster if i.read_id in cluster_res[0][1]], True)
        if tmp_res is not None:
            circ = tmp_res
            circ_id = '{}:{}-{}'.format(circ.contig, circ.start, circ.end)

    # Filter out strange cirexons
    curated_exons = curate_cirexons(circ, cluster)
    if curated_exons is None:
        return None
    isoforms, isoform_reads, circ_len = curate_isoform(circ, curated_exons,
                                                       cluster_res)
    if isoforms is None:
        return None
    is_concordance = check_isoforms(circ, isoforms)
    if not is_concordance:
        return None

    if is_debug:
        return circ

    return circ_type, ([i.read_id for i in cluster
                        ], isoform_reads, cluster_seq, circ_id, circ.strand,
                       ss_id, us_free, ds_free, circ_len, isoforms)
Beispiel #7
0
def poa(seqs, allseq=False):
    """Return the spoa consensus of ``seqs`` as a tuple.

    The tuple holds only the consensus by default; with a truthy ``allseq``
    the input sequences follow the consensus.  MSA generation is disabled.
    """
    consensus, _msa = spoa.poa(seqs, genmsa=False)
    return (consensus, *seqs) if allseq else (consensus, )
Beispiel #8
0
 def test_bindings(self):
     """Check that a default ``poa`` call yields the expected consensus and a three-row MSA."""
     reads = ['AACTTATA', 'AACTTATG', 'AACTATA']
     consensus, msa = poa(reads)
     self.assertEqual(consensus, 'AACTTATA')
     self.assertEqual(len(msa), 3)
Beispiel #9
0
 def test_bindings_no_msa(self):
     """Check that ``poa`` with ``genmsa=False`` yields the consensus and an empty MSA."""
     reads = ['AACTTATA', 'AACTTATG', 'AACTATA']
     consensus, msa = poa(reads, genmsa=False)
     self.assertEqual(consensus, 'AACTTATA')
     self.assertEqual(len(msa), 0)