Exemple #1
0
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Collect conservation scores for every 4-mer site inside one intron.

    Each 4-letter motif over 'ATCG' is searched in the intron sequence.
    Every hit that passes the cal_distance check contributes the bigWig
    summary value of the 4-base window [pos - 2, pos + 2) around it.

    Args:
        options: mapping providing '--genome', '--bigwig' and
            '--min-distance'.
        chrom: chromosome name used for the FASTA and bigWig queries.
        start: intron start coordinate.
        end: intron end coordinate.
        strand: '+' uses forward coordinates; any other value is handled
            as the minus strand.
        intron_info: not used by this function.

    Returns:
        A tuple (rs_list, intron_length). rs_list maps each motif to a
        list that starts with the placeholder 'x' followed by the scores
        of its accepted sites. intron_length is
        end - start - 2 * min_distance.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # parse options
    phastcons_f = pyBigWig.open(options['--bigwig'])
    try:
        min_distance = int(options['--min-distance'])
        # start to parse rs sites
        rs_list = defaultdict(list)
        for letters in product('ATCG', repeat=4):
            motif = ''.join(letters)
            # placeholder keeps every motif present in the result
            rs_list[motif].append('x')
            # NOTE(review): finditer reports non-overlapping matches only,
            # so self-overlapping motifs (e.g. 'AAAA') may be undercounted
            # -- confirm this is intended.
            for m in re.finditer(motif, intron_fa):
                if strand == '+':
                    pos = start + m.start() + 2
                else:
                    # offsets count back from `end` on the minus strand
                    # (presumably the sequence is reverse-complemented
                    # by dna_to_rna -- confirm)
                    pos = end - m.start() - 2
                _left, _right, dist_flag = cal_distance(pos, start, end,
                                                        min_distance)
                if not dist_flag:  # not enough distance
                    continue
                phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
                if phastcons is None:  # no conservation score
                    continue
                rs_list[motif].append(phastcons)
        intron_length = end - start - 2 * min_distance
        return (rs_list, intron_length)
    finally:
        # release the bigWig handle even if a query above fails
        phastcons_f.close()
Exemple #2
0
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Find conserved occurrences of a user-supplied motif in one intron.

    The pattern from options['-m'] is searched in the intron sequence.
    A hit is kept when it passes the cal_distance check and the bigWig
    summary value of the window [pos - 2, pos + 2) is at least
    '--min-phastcons'.

    Args:
        options: mapping providing '--genome', '-m', '--bigwig',
            '--min-distance' and '--min-phastcons'.
        chrom: chromosome name used for the FASTA and bigWig queries.
        start: intron start coordinate.
        end: intron end coordinate.
        strand: '+' uses forward coordinates; any other value is handled
            as the minus strand.
        intron_info: returned unchanged when at least one site is found.

    Returns:
        (intron_info, rs_list) where rs_list holds strings formatted as
        'pos|left_dist|right_dist|phastcons', or (None, None) when no
        site qualifies.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # parse options
    motif = options['-m']
    phastcons_f = pyBigWig.open(options['--bigwig'])
    try:
        min_distance = int(options['--min-distance'])
        min_phastcons = float(options['--min-phastcons'])
        # start to parse rs sites
        rs_list = []
        for m in re.finditer(motif, intron_fa):
            if strand == '+':
                pos = start + m.start() + 2
            else:
                # offsets count back from `end` on the minus strand
                # (presumably the sequence is reverse-complemented by
                # dna_to_rna -- confirm)
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                continue  # not conserved
            rs_list.append('%d|%d|%d|%f' % (pos, left_dist, right_dist,
                                            phastcons))
    finally:
        # release the bigWig handle even if a query above fails
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
Exemple #3
0
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Find conserved, high-scoring 'AGGT' sites inside one intron.

    Each 'AGGT' hit must pass, in order: the cal_distance check, an
    N-free 23-base window around the site, the cal_score threshold
    ('--min-score', scored against the matrix from load_matrix3), and
    the '--min-phastcons' threshold on the bigWig summary value of
    [pos - 2, pos + 2).

    Args:
        options: mapping providing '--genome', '--bigwig',
            '--min-distance', '--min-score' and '--min-phastcons'.
        chrom: chromosome name used for the FASTA and bigWig queries.
        start: intron start coordinate.
        end: intron end coordinate.
        strand: '+' uses forward coordinates; any other value is handled
            as the minus strand.
        intron_info: returned unchanged when at least one site is found.

    Returns:
        (intron_info, rs_list) where rs_list holds strings formatted as
        'pos|left_dist|right_dist|score|phastcons', or (None, None) when
        no site qualifies.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # load matrix
    matrix3 = load_matrix3()
    # parse options
    phastcons_f = pyBigWig.open(options['--bigwig'])
    try:
        min_distance = int(options['--min-distance'])
        min_score = float(options['--min-score'])
        min_phastcons = float(options['--min-phastcons'])
        # start to parse rs sites
        rs_list = []
        for m in re.finditer('AGGT', intron_fa):
            plus = strand == '+'
            if plus:
                pos = start + m.start() + 2
            else:
                # offsets count back from `end` on the minus strand
                # (presumably the sequence is reverse-complemented by
                # dna_to_rna -- confirm)
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            # 23-base scoring window: 20 bases on one side of the site
            # and 3 on the other, mirrored for the minus strand
            if plus:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 20, pos + 3))
            else:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 20),
                                     strand='-')
            if 'N' in ss3_seq:  # ensure there is no N
                continue
            ss3, score_flag = cal_score(ss3_seq, matrix3, min_score)
            if not score_flag:  # not high score
                continue
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                continue  # not conserved
            rs_list.append('%d|%d|%d|%f|%f' % (pos, left_dist, right_dist,
                                               ss3, phastcons))
    finally:
        # release the bigWig handle even if a query above fails
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
Exemple #4
0
def main():
    """Command-line entry point: build the index, align reads, report sites.

    Steps:
      1. Parse the docopt options and validate the optional '--seq' file
         (exits with an error message when the path is not a file).
      2. Create the output directory, or check it exists when
         '--skip-alignment' is set.
      3. Build the index with build_index.
      4. Unless alignment is skipped, convert the input FASTQ files into
         a temporary reads file and align it with bowtie2_align;
         otherwise reuse '<out_dir>/cs.bam'.
      5. Hand the BAM file to fetch_reads.
    """
    # parse options
    options = docopt(__doc__, version=__version__)
    seq_flag = bool(options['--seq'])
    if seq_flag:
        if not os.path.isfile(options['--seq']):
            sys.exit('Error: wrong seq file!')
        seq = os.path.abspath(options['--seq'])
    else:
        seq = None
    fa = check_fasta(options['--genome'])
    chrom = options['--chrom']
    site = int(options['--site'])
    strand = '+' if options['--strand'] == '1' else '-'
    rlen = int(options['--read-length'])
    alen = int(options['--region-length'])
    clen = int(options['--check-region-length'])
    thread = options['--thread']
    skip_flag = options['--skip-alignment']
    # check output directory
    if not skip_flag:  # not skip alignment
        out_dir = create_dir(options['<out_dir>'])
    else:  # skip alignment
        out_dir = check_dir(options['<out_dir>'])
    # build index for sgRNA
    index_path, offset = build_index(fa, chrom, site, strand, rlen, thread,
                                     out_dir, seq, seq_flag)
    if not skip_flag:  # not skip alignment
        # deal with reads file; the context manager removes the temporary
        # file even when conversion or alignment raises
        with tempfile.NamedTemporaryFile(mode='w+') as reads:
            if options['-R']:
                fq_lst = options['-R'].split(',')
                convert_read(reads, single=fq_lst)
            else:
                fq1_lst = options['-1'].split(',')
                fq2_lst = options['-2'].split(',')
                convert_read(reads, fq1=fq1_lst, fq2=fq2_lst)
            # rewind the handle before passing its path to the aligner
            reads.seek(0)
            # mapped reads with bowtie2
            bam = bowtie2_align(index_path, reads.name, thread, out_dir)
    else:
        bam = os.path.join(out_dir, 'cs.bam')
    # fetch cleavage site reads
    fetch_reads(index_path, offset, alen, clen, bam, out_dir)
Exemple #5
0
def _window(text, center, half):
    """Return text[center - half:center + half] with the start clamped at 0.

    Without clamping, a negative start index would wrap around to the end
    of the string and yield an unrelated (usually empty) slice.
    """
    return text[max(center - half, 0):center + half]


def _render_read(seq, start, cigartuples):
    """Lay a read out as an (alignment track, insertion track) pair.

    Both tracks begin with `start` spaces. CIGAR operations are applied
    in order, using the numeric op codes from `cigartuples`.
    """
    align_parts = [' ' * start]
    insert_parts = [' ' * start]
    pos = 0  # cursor into the query sequence
    for op, length in cigartuples:
        if op in (0, 7, 8):  # M, =, X: aligned bases
            align_parts.append(seq[pos:pos + length])
            insert_parts.append(' ' * length)
            pos += length
        elif op == 1:  # I: bases present only in the read
            insert_parts.append(seq[pos:pos + length])
            pos += length
        elif op in (2, 3):  # D, N: reference bases absent from the read
            align_parts.append('*' * length)
        elif op == 4:  # S: clipped bases stay in seq, so skip over them
            pos += length
        # H (5) and P (6) consume neither query nor reference
    return ''.join(align_parts), ''.join(insert_parts)


def fetch_reads(index, offset, alen, clen, bam, out_dir):
    """Write per-read and aggregated views of the region around `offset`.

    Two files are created in out_dir:
      * 'cs_region.txt': a header line, the reference window of half-width
        `alen`, then an 'Alignment' and an 'Insertion' line per read.
      * 'cs_count.txt': the reference window of half-width `clen`, then
        one 'Reads'/'Indel' pair per distinct read pattern with its
        count, in first-seen order.

    Args:
        index: FASTA path passed to check_fasta; the record
            'sgRNA_region' is used as the reference.
        offset: centre of the reported windows within that record.
        alen: half-width of the window written to 'cs_region.txt'.
        clen: half-width of the window used for counting.
        bam: path of the BAM file to read.
        out_dir: directory receiving both output files.

    Reads that are unmapped, or lack a CIGAR or a sequence, are skipped.
    """
    cs = os.path.join(out_dir, 'cs_region.txt')
    count = os.path.join(out_dir, 'cs_count.txt')
    with open(cs, 'w') as cs_out, open(count, 'w') as count_out:
        cs_out.write(' ' * alen + 'sgRNA|PAM|\n')
        index_fa = check_fasta(index).fetch('sgRNA_region')
        cs_out.write('Reference: ' + _window(index_fa, offset, alen) + '\n')
        count_out.write('Reference: ' + _window(index_fa, offset, clen) +
                        '\n')
        cs_count = defaultdict(int)
        # the context manager closes the BAM handle on every exit path
        with pysam.AlignmentFile(bam, 'rb') as bam_f:
            for read in bam_f:
                seq = read.query_sequence
                cigar = read.cigartuples
                # unmapped reads carry no CIGAR and cannot be laid out
                if read.is_unmapped or not cigar or seq is None:
                    continue
                align, insert = _render_read(seq, read.reference_start,
                                             cigar)
                cs_out.write('Alignment: ' + _window(align, offset, alen) +
                             '\n')
                cs_out.write('Insertion: ' + _window(insert, offset, alen) +
                             '\n')
                pattern = (_window(align, offset, clen),
                           _window(insert, offset, clen))
                cs_count[pattern] += 1
        # NOTE(review): 'Indel    :' has no space after the colon, unlike
        # the other labels, so this line sits one column to the left --
        # kept as-is; confirm whether that is intentional.
        for (read_part, indel_part), total in cs_count.items():
            count_out.write('Reads    : ' + read_part + '\t%d\n' % total +
                            'Indel    :' + indel_part + '\n')