Beispiel #1
0
def scan_raw_chunk(chunk, is_canonical, circ_reads):
    """
    Scan raw reads for partial circRNA candidates

    Reads whose id is in `circ_reads` are skipped and reads shorter than
    300 bases are handed back untouched in `short_reads`. Every other read
    is aligned with `env.ALIGNER`; reads that pass the alignment, junction
    and clipping filters below are reported as 'partial' records.

    :param chunk: iterable of (read_id, seq) pairs
    :param is_canonical: not used by this function, kept for the caller's
        interface
    :param circ_reads: container of read ids to skip (only tested with `in`)
    :return: (reads_cnt, ret, short_reads) where reads_cnt counts accepted
        reads under the key 'partial', ret is a list of
        (read_id, circ_id, strand, exon_tag, ss_id, junction_tag, 'partial',
        circ_seq) tuples and short_reads is a list of (read_id, seq) pairs
    """
    reads_cnt = defaultdict(int)

    ret = []
    short_reads = []

    for read_id, seq in chunk:
        if read_id in circ_reads:
            continue

        # Reads below 300 bases are not processed here, only collected
        if len(seq) < 300:
            short_reads.append((read_id, seq))
            continue

        # Remove reads that have ambiguous mapping: only one or two primary
        # hits (ordered by query position) are accepted
        raw_hits = sorted([i for i in env.ALIGNER.map(seq) if i.is_primary],
                          key=lambda x: [x.q_st, x.q_en])
        if len(raw_hits) == 0:
            continue
        elif len(raw_hits) == 1:
            raw_hit = remove_long_insert(raw_hits[0])
            # Matched length must be at least 45% of the read but leave
            # 50 or more bases unmatched
            if raw_hit.mlen < len(seq) * .45 or raw_hit.mlen > len(seq) - 50:
                continue
            # Skip hits that span the read to within 50 bases of both ends
            if raw_hit.q_st < 50 and raw_hit.q_en > len(seq) - 50:
                continue
            circ, junc = find_bsj(seq)
            if junc is None:
                continue

        elif len(raw_hits) == 2:
            head, tail = remove_long_insert(raw_hits[0]), remove_long_insert(
                raw_hits[1])
            if head.ctg != tail.ctg:
                continue
            if not head.q_st + head.mlen * 0.45 < tail.q_st:
                continue
            # The second hit (in query order) has to start at least 20 bases
            # before the reference end of the first hit
            if head.r_en - 20 < tail.r_st:
                continue
            # No more than 50 unaligned query bases between the two hits
            if head.q_en < tail.q_st - 50:
                continue
            circ, junc = find_bsj(seq)
            # The junction has to fall between the two hits (10 base slack)
            if junc is None or junc < head.q_en - 10 or junc > tail.q_st + 10:
                continue
        else:
            continue

        # Re-align the sequence returned by find_bsj
        circ_hits = sorted([
            remove_long_insert(i)
            for i in env.ALIGNER.map(circ) if i.is_primary
        ],
                           key=lambda x: [x.q_st, x.q_en])
        if len(circ_hits) == 0:
            continue
        elif len(circ_hits) == 1:
            circ_hit = circ_hits[0]
            # The new alignment has to match more bases than any raw hit
            if circ_hit.mlen <= max([i.mlen for i in raw_hits]):
                continue
            # Junction must be at least 30 bases away from both read ends
            if min(junc, len(seq) - junc) < 30:
                continue
            if not junc + circ_hit.q_st < len(seq) < junc + circ_hit.q_en:
                continue
            circ_ctg, circ_start, circ_end, circ_strand = circ_hit.ctg, circ_hit.r_st, circ_hit.r_en, circ_hit.strand
            clip_base = circ_hit.q_st + len(seq) - circ_hit.q_en
            cir_exons = get_parital_blocks(circ_hit, len(seq) - junc)
        elif len(circ_hits) == 2:
            head, tail = circ_hits[0], circ_hits[1]
            if head.ctg != tail.ctg or head.strand != tail.strand:
                continue
            if not head.q_st + (head.q_en - head.q_st) * 0.5 < tail.q_st:
                continue
            if head.r_en - 20 < tail.r_st:
                continue
            if head.q_en < tail.q_st - 20:
                continue
            circ_ctg, circ_start, circ_end, circ_strand = head.ctg, tail.r_st, head.r_en, head.strand
            clip_base = abs(tail.q_st - head.q_en)

            head_exons = get_blocks(head)
            tail_exons = get_blocks(tail)

            cir_exons = merge_exons(tail_exons, head_exons)

            # Rotate so the sequence starts where the second hit starts
            circ = circ[tail.q_st:] + circ[:tail.q_st]
        else:
            continue

        if clip_base > 20:
            continue

        # Retrieve circRNA positions, convert minimap2 position to real position
        host_strand = find_host_gene(circ_ctg, circ_start, circ_end)
        try:
            ss_site, us_free, ds_free, tmp_signal = find_annotated_signal(
                circ_ctg, circ_start, circ_end, clip_base, clip_base + 10)
        except Exception as e:
            # Nothing below can run without these four values. Falling
            # through would either raise NameError or silently reuse the
            # values left over from a previous read, so skip this read.
            LOGGER.warning(e)
            continue
        if ss_site is None:
            ss_site = find_denovo_signal(circ_ctg, circ_start, circ_end,
                                         host_strand, tmp_signal, us_free,
                                         ds_free, clip_base, clip_base + 10, 3,
                                         True)

        if ss_site is None:
            strand = 'NA'
            ss_id = 'NA'
            correction_shift = 0
        else:
            ss_id, strand, us_shift, ds_shift = ss_site
            circ_start += us_shift
            circ_end += ds_shift
            # Clamp the upstream shift into [-us_free, ds_free]
            correction_shift = min(max(us_shift, -us_free), ds_free)

        circ_id = '{}:{}-{}'.format(circ_ctg, circ_start + 1, circ_end)
        # Outer exon boundaries follow the (possibly shifted) circ boundaries
        cir_exons[0][0] = circ_start
        cir_exons[-1][1] = circ_end

        cir_exon_tag = []
        for cir_exon_start, cir_exon_end, cir_exon_len in cir_exons:
            cir_exon_tag.append('{}-{}|{}'.format(cir_exon_start, cir_exon_end,
                                                  cir_exon_len))

        circ_seq = circ if circ_strand > 0 else revcomp(circ)
        # Rotate the sequence by the junction correction
        circ_seq = circ_seq[correction_shift:] + circ_seq[:correction_shift]

        ret.append((read_id, circ_id, strand, ','.join(cir_exon_tag), ss_id,
                    '{}|{}-NA'.format(junc, clip_base), 'partial', circ_seq))

        reads_cnt['partial'] += 1

    return reads_cnt, ret, short_reads
Beispiel #2
0
def align_clip_segments(circ, hit):
    """
    Align the bases of `circ` that `hit` left clipped

    With fewer than 20 clipped bases in total no alignment is attempted:
    the sequence is rotated to start at `hit.q_st` and the hit's own
    reference span is reported. Otherwise the clipped bases are aligned
    with the striped Smith-Waterman wrapper against the genome within
    200000 bases on either side of the hit (reverse complemented for
    hits that are not on the positive strand).

    :param circ: query sequence that produced `hit`
    :param hit: alignment of `circ`; q_st, q_en, r_st, r_en, ctg and strand
        are read
    :return: (clipped_circ, circ_start, circ_end,
        (clip_r_st, clip_r_en, clip_base)); four Nones when the clipped
        part exceeds 60% of `circ` or at least 30% of the genome window
        is 'N'
    """
    from libs.striped_smith_waterman.ssw_wrap import Aligner
    from collections import Counter

    head_clip = hit.q_st
    tail_clip = len(circ) - hit.q_en

    # Small clip: nothing to align, clip coordinates stay None
    if head_clip + tail_clip < 20:
        rotated = circ[hit.q_st:] + circ[:hit.q_st]
        return rotated, hit.r_st - 1, hit.r_en, (None, None,
                                                 head_clip + tail_clip)

    # Clipped tail followed by clipped head
    clip_seq = circ[hit.q_en:] + circ[:hit.q_st]
    if len(clip_seq) > 0.6 * len(circ):
        return None, None, None, None

    window_st = max(hit.r_st - 200000, 0)
    window_en = min(hit.r_en + 200000, env.CONTIG_LEN[hit.ctg])
    window_seq = env.GENOME.seq(hit.ctg, window_st, window_en)
    if Counter(window_seq)['N'] >= 0.3 * (window_en - window_st):
        return None, None, None, None

    on_plus = hit.strand > 0
    reference = window_seq if on_plus else revcomp(window_seq)
    aligner = Aligner(reference,
                      match=1,
                      mismatch=1,
                      gap_open=1,
                      gap_extend=1)
    aln = aligner.align(clip_seq)

    # Translate window offsets back to contig coordinates; on the other
    # strand the window was reverse complemented, so count from its end
    if on_plus:
        clip_r_st = window_st + aln.ref_begin
        clip_r_en = window_st + aln.ref_end
        use_clip_alignment = clip_r_st < hit.r_st
    else:
        clip_r_st = window_en - aln.ref_end
        clip_r_en = window_en - aln.ref_begin
        use_clip_alignment = clip_r_en > hit.r_en

    if use_clip_alignment:
        # Start the sequence at the first aligned base of the clipped part
        clipped_circ = (clip_seq[aln.query_begin:] + circ[hit.q_st:hit.q_en] +
                        clip_seq[:aln.query_begin])
    else:
        clipped_circ = circ[hit.q_st:] + circ[:hit.q_st]

    # Clipped bases that are still unaligned after the second alignment
    aligned_len = aln.query_end - aln.query_begin
    clip_base = hit.q_st + len(circ) - hit.q_en - aligned_len + 1
    circ_start = min(hit.r_st, clip_r_st) - 1
    circ_end = max(hit.r_en, clip_r_en)

    return clipped_circ, circ_start, circ_end, (clip_r_st, clip_r_en,
                                                clip_base)
Beispiel #3
0
def recover_ccs_chunk(chunk, is_canonical):
    """
    Recover circRNA records from reads that have a consensus sequence

    :param chunk: iterable of (read_id, segments, ccs, raw) tuples;
        `segments` is a ';' separated list of 'start-end' pairs, `raw` is
        not used here
    :param is_canonical: not used by this function, kept for the caller's
        interface
    :return: (reads_cnt, ret) where reads_cnt counts reads under the keys
        'ccs_mapped', 'bsj' and 'signal' and ret is a list of
        (read_id, circ_id, strand, exon_tag, ss_id, junction_tag, segments,
        circ_seq) tuples
    """
    reads_cnt = defaultdict(int)
    ret = []

    for read_id, segments, ccs, _raw in chunk:
        # Span covered by the segments: start of the first, end of the last
        seg_fields = segments.split(';')
        seg_st = int(seg_fields[0].split('-')[0])
        seg_en = int(seg_fields[-1].split('-')[1])

        # Remove other mapped region that intersect with ccs: the doubled
        # consensus may not align over more query bases than the segments span
        ccs_hit = get_primary_alignment(env.ALIGNER.map(ccs * 2))
        if ccs_hit is None:
            continue
        if seg_en - seg_st < ccs_hit.q_en - ccs_hit.q_st:
            continue
        reads_cnt['ccs_mapped'] += 1

        # Find back-spliced junction site
        # NOTE(review): scan_raw_chunk skips the read when find_bsj returns
        # junc None; there is no such guard here - confirm it cannot happen
        circ, junc = find_bsj(ccs)

        circ_hit = get_primary_alignment(env.ALIGNER.map(circ))
        if circ_hit is None:
            continue

        clipped_circ, circ_start, circ_end, clip_info = align_clip_segments(
            circ, circ_hit)
        if circ_start is None or circ_end is None:
            continue

        # Reject when more than 15% of the consensus or more than 20 bases
        # remain clipped
        clip_base = clip_info[2]
        if clip_base > min(0.15 * len(ccs), 20):
            continue
        reads_cnt['bsj'] += 1

        # Retrieve circRNA positions, convert minimap2 position to real position
        ctg = circ_hit.ctg
        host_strand = find_host_gene(ctg, circ_start, circ_end)
        ss_site, us_free, ds_free, tmp_signal = find_annotated_signal(
            ctg, circ_start, circ_end, clip_base, clip_base + 10)
        if ss_site is None:
            ss_site = find_denovo_signal(ctg, circ_start, circ_end,
                                         host_strand, tmp_signal, us_free,
                                         ds_free, clip_base, clip_base + 10, 3,
                                         True)

        if ss_site is not None:
            reads_cnt['signal'] += 1
            ss_id, strand, us_shift, ds_shift = ss_site
            circ_start += us_shift
            circ_end += ds_shift
            # NOTE(review): scan_raw_chunk uses -us_free in this expression;
            # confirm which sign is intended
            correction_shift = min(max(us_shift, us_free), ds_free)
        else:
            ss_id = 'NA'
            strand = 'NA'
            correction_shift = 0

        circ_id = '{}:{}-{}'.format(ctg, circ_start + 1, circ_end)

        # Get cirexons; the outer boundaries follow the circ boundaries
        cir_exons = merge_clip_exon(get_blocks(circ_hit), clip_info)
        cir_exons[0][0] = circ_start
        cir_exons[-1][1] = circ_end
        cir_exon_tag = ','.join(
            '{}-{}|{}'.format(exon_st + 1, exon_en, exon_len)
            for exon_st, exon_en, exon_len in cir_exons)

        # BSJ correction for 5' prime region
        if circ_hit.strand > 0:
            oriented = clipped_circ
        else:
            oriented = revcomp(clipped_circ)
        circ_seq = oriented[correction_shift:] + oriented[:correction_shift]

        junc_tag = '{}|{}-{}'.format(junc, clip_base, len(circ))
        ret.append((read_id, circ_id, strand, cir_exon_tag, ss_id, junc_tag,
                    segments, circ_seq))

    return reads_cnt, ret
Beispiel #4
0
def search_splice_signal(contig,
                         start,
                         end,
                         clip_base,
                         search_length=10,
                         shift_threshold=3):
    """
    Search splice signals around a putative junction

    Sites listed in `env.SS_INDEX` are tried first (when the index is not
    None); if none of them pair up, the motifs in `SPLICE_SIGNAL` are
    searched in the genome sequence on both strands.

    :param contig: contig name
    :param start: junction start on the contig
    :param end: junction end on the contig
    :param clip_base: number of clipped bases, widens the tolerated
        difference between upstream and downstream shift
    :param search_length: number of bases searched on either side
    :param shift_threshold: base tolerance for the difference between
        upstream and downstream shift
    :return: (site, us_free, ds_free); site is the result of `sort_ss` or
        None when the search window leaves the contig or nothing is found
    """

    def _annotated(pos, strand):
        # True when the annotation index lists `strand` at `pos`
        return (contig in env.SS_INDEX and pos in env.SS_INDEX[contig]
                and strand in env.SS_INDEX[contig][pos])

    def _occurrences(haystack, needle):
        # Offsets of every match of `needle`, scanning from offset 1
        # NOTE(review): a match at offset 0 is never reported - confirm
        # that this is intended
        offsets = []
        pos = haystack.find(needle, 1)
        while pos != -1:
            offsets.append(pos)
            pos = haystack.find(needle, pos + 1)
        return offsets

    contig_len = env.CONTIG_LEN[contig]

    # Find free sliding region
    # start | real_start <-> end | real_end
    # Downstream: longest stretch (below 100) on which the bases after
    # `start` equal the bases after `end`
    ds_free = 0
    for step in range(100):
        if end + step > contig_len:
            break
        if env.GENOME.seq(contig, start, start + step) != env.GENOME.seq(
                contig, end, end + step):
            break
        ds_free = step

    # Upstream: same comparison on the bases before `start` and `end`
    us_free = 0
    for step in range(100):
        if start - step < 0:
            break
        if env.GENOME.seq(contig, start - step, start) != env.GENOME.seq(
                contig, end - step, end):
            break
        us_free = step

    # The search window has to fit inside the contig
    if (start - search_length - us_free - 2 < 0
            or end + search_length + ds_free + 2 > contig_len):
        return None, us_free, ds_free

    # Splice site: site_id, strand, us_shift, ds_shift, site_weight, altered_len, altered_total
    # First: Find flanking junction from annotation gtf
    if env.SS_INDEX is not None:
        shift_window = range(-search_length, search_length)
        anno_ss = []
        for strand in ('+', '-'):
            # Upstream shifts are stored one base lower than the offset at
            # which the annotated site was found
            us_shifts = [
                shift - 1 for shift in shift_window
                if _annotated(start + shift, strand)
            ]
            ds_shifts = [
                shift for shift in shift_window
                if _annotated(end + shift, strand)
            ]
            if not us_shifts or not ds_shifts:
                continue

            for us_shift in us_shifts:
                for ds_shift in ds_shifts:
                    if abs(us_shift - ds_shift) > shift_threshold + clip_base:
                        continue
                    # Two bases before the shifted start / after the shifted end
                    us_ss = env.GENOME.seq(contig, start + us_shift - 2,
                                           start + us_shift)
                    ds_ss = env.GENOME.seq(contig, end + ds_shift,
                                           end + ds_shift + 2)
                    if strand == '-':
                        us_ss, ds_ss = revcomp(ds_ss), revcomp(us_ss)
                    ss_id = '{}-{}|{}-{}'.format(us_ss, ds_ss, us_shift,
                                                 ds_shift)
                    # Motif pairs missing from SPLICE_SIGNAL get weight 3
                    ss_weight = SPLICE_SIGNAL.get((ds_ss, us_ss), 3)
                    altered = get_ss_altered_length(us_shift, ds_shift,
                                                    us_free, ds_free,
                                                    clip_base)
                    anno_ss.append((ss_id, strand, us_shift, ds_shift,
                                    ss_weight) + tuple(altered))

        if anno_ss:
            return sort_ss(anno_ss, us_free, ds_free,
                           clip_base), us_free, ds_free

    # Second: Find Denovo BSJ using pre-defined splice signal
    us_search_length = search_length + us_free
    ds_search_length = search_length + ds_free
    us_seq = env.GENOME.seq(contig, start - us_search_length - 2,
                            start + ds_search_length)
    ds_seq = env.GENOME.seq(contig, end - us_search_length,
                            end + ds_search_length + 2)

    # NOTE(review): the requested windows are
    # us_search_length + ds_search_length + 2 bases long, the minimum
    # checked here uses the difference - confirm
    min_seq_len = ds_search_length - us_search_length + 2
    for window_seq in (us_seq, ds_seq):
        if window_seq is None or len(window_seq) < min_seq_len:
            return None, us_free, ds_free

    putative_ss = []
    for strand in ('+', '-'):
        for (tmp_ds_ss, tmp_us_ss), ss_weight in SPLICE_SIGNAL.items():
            # On the minus strand the motifs swap sides and are reverse
            # complemented
            if strand == '-':
                ds_ss, us_ss = revcomp(tmp_us_ss), revcomp(tmp_ds_ss)
            else:
                ds_ss, us_ss = tmp_ds_ss, tmp_us_ss

            # Find upstream and downstream signal
            us_hits = _occurrences(us_seq, us_ss)
            ds_hits = _occurrences(ds_seq, ds_ss)

            # Filter paired splice signal in concordance position
            if not us_hits or not ds_hits:
                continue

            for us_hit in us_hits:
                for ds_hit in ds_hits:
                    if abs(us_hit - ds_hit) > clip_base + shift_threshold:
                        continue
                    # Both windows begin us_search_length bases before
                    # their anchor (after the two signal bases upstream)
                    us_shift = us_hit - us_search_length
                    ds_shift = ds_hit - us_search_length
                    ss_id = '{}-{}*|{}-{}'.format(tmp_us_ss, tmp_ds_ss,
                                                  us_shift, ds_shift)
                    altered = get_ss_altered_length(us_shift, ds_shift,
                                                    us_free, ds_free,
                                                    clip_base)
                    putative_ss.append((ss_id, strand, us_shift, ds_shift,
                                        ss_weight) + tuple(altered))

    if putative_ss:
        return sort_ss(putative_ss, us_free, ds_free,
                       clip_base), us_free, ds_free

    return None, us_free, ds_free