def parse_intron(options, chrom, start, end, strand, intron_info):
    """Collect conservation scores around every 4-mer occurrence in an intron.

    For each of the 256 motifs over the alphabet ``ATCG``, the intron
    sequence is scanned with ``re.finditer`` (non-overlapping matches), each
    hit is converted to a genomic position, filtered by ``cal_distance`` and
    then looked up in the bigWig file given by ``--bigwig``.

    Args:
        options: option mapping; reads ``--genome``, ``--bigwig`` and
            ``--min-distance``.
        chrom: chromosome name passed to the fasta and bigWig readers.
        start: intron start coordinate.
        end: intron end coordinate.
        strand: ``'+'`` or anything else (treated as the minus strand).
        intron_info: unused here; kept so the signature matches callers.

    Returns:
        A tuple ``(rs_list, intron_length)`` where ``rs_list`` maps each
        motif to a list starting with the placeholder ``'x'`` followed by the
        scores of accepted sites, and ``intron_length`` is
        ``end - start - 2 * min_distance``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # parse options
    min_distance = int(options['--min-distance'])
    phastcons_f = pyBigWig.open(options['--bigwig'])
    # start to parse rs sites
    rs_list = defaultdict(list)
    try:
        for bases in product('ATCG', repeat=4):
            motif = ''.join(bases)
            # Placeholder guarantees every motif key exists with a non-empty
            # list (presumably relied on downstream -- confirm with caller).
            rs_list[motif].append('x')
            for m in re.finditer(motif, intron_fa):
                # The sequence was oriented by strand, so match offsets count
                # from `start` on '+' and back from `end` otherwise.
                if strand == '+':
                    pos = start + m.start() + 2
                else:
                    pos = end - m.start() - 2
                left_dist, right_dist, dist_flag = cal_distance(
                    pos, start, end, min_distance)
                if not dist_flag:  # not enough distance
                    continue
                phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
                if phastcons is None:  # no conservation score
                    continue
                rs_list[motif].append(phastcons)
    finally:
        # Release the bigWig handle even if a lookup raises; the original
        # code never closed it.
        phastcons_f.close()
    intron_length = end - start - 2 * min_distance
    return rs_list, intron_length
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Find conserved occurrences of a user-supplied motif inside an intron.

    The intron sequence is scanned with ``re.finditer`` for the pattern in
    ``options['-m']``. Each hit is converted to a genomic position, filtered
    by ``cal_distance`` and by the score read from the ``--bigwig`` file.

    Args:
        options: option mapping; reads ``--genome``, ``-m``, ``--bigwig``,
            ``--min-distance`` and ``--min-phastcons``.
        chrom: chromosome name passed to the fasta and bigWig readers.
        start: intron start coordinate.
        end: intron end coordinate.
        strand: ``'+'`` or anything else (treated as the minus strand).
        intron_info: opaque value handed back unchanged when sites are found.

    Returns:
        ``(intron_info, rs_list)`` when at least one site passes all filters,
        where each entry of ``rs_list`` is the string
        ``'pos|left_dist|right_dist|phastcons'``; otherwise ``(None, None)``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # parse options
    motif = options['-m']
    min_distance = int(options['--min-distance'])
    min_phastcons = float(options['--min-phastcons'])
    phastcons_f = pyBigWig.open(options['--bigwig'])
    # start to parse rs sites
    rs_list = []
    try:
        for m in re.finditer(motif, intron_fa):
            # The sequence was oriented by strand, so match offsets count
            # from `start` on '+' and back from `end` otherwise.
            if strand == '+':
                pos = start + m.start() + 2
            else:
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                # not conserved
                continue
            rs_feature = '%d|%d|%d|%f' % (pos, left_dist, right_dist,
                                          phastcons)
            rs_list.append(rs_feature)
    finally:
        # Release the bigWig handle even if a lookup raises; the original
        # code never closed it.
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Find conserved, high-scoring ``AGGT`` sites inside an intron.

    Every ``AGGT`` match in the intron sequence is converted to a genomic
    position and kept only if it passes, in order: the ``cal_distance``
    check, an ``N``-free 23-base window around the site, the ``cal_score``
    check against the matrix from ``load_matrix3()``, and the minimum score
    read from the ``--bigwig`` file.

    Args:
        options: option mapping; reads ``--genome``, ``--bigwig``,
            ``--min-distance``, ``--min-score`` and ``--min-phastcons``.
        chrom: chromosome name passed to the fasta and bigWig readers.
        start: intron start coordinate.
        end: intron end coordinate.
        strand: ``'+'`` or anything else (treated as the minus strand).
        intron_info: opaque value handed back unchanged when sites are found.

    Returns:
        ``(intron_info, rs_list)`` when at least one site passes all filters,
        where each entry of ``rs_list`` is the string
        ``'pos|left_dist|right_dist|ss3|phastcons'``; otherwise
        ``(None, None)``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # load matrix
    matrix3 = load_matrix3()
    # parse options
    min_distance = int(options['--min-distance'])
    min_score = float(options['--min-score'])
    min_phastcons = float(options['--min-phastcons'])
    phastcons_f = pyBigWig.open(options['--bigwig'])
    # start to parse rs sites
    rs_list = []
    try:
        for m in re.finditer('AGGT', intron_fa):
            plus = strand == '+'
            # The sequence was oriented by strand, so match offsets count
            # from `start` on '+' and back from `end` otherwise.
            pos = start + m.start() + 2 if plus else end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            # 23-base window: 20 bases on the intronic side of the site and
            # 3 on the other, mirrored for the minus strand.
            if plus:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 20, pos + 3))
            else:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 20),
                                     strand='-')
            if ss3_seq.find('N') != -1:  # ensure there is no N
                continue
            ss3, score_flag = cal_score(ss3_seq, matrix3, min_score)
            if not score_flag:  # not high score
                continue
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                # not conserved
                continue
            rs_feature = '%d|%d|%d|%f|%f' % (pos, left_dist, right_dist,
                                             ss3, phastcons)
            rs_list.append(rs_feature)
    finally:
        # Release the bigWig handle even if a lookup raises; the original
        # code never closed it.
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
def main():
    """Command-line entry point: build the sgRNA index, align, summarise.

    Steps:
      1. Parse the docopt options and validate the optional ``--seq`` file
         (exits with an error message if it is not a file).
      2. Create the output directory (or only check it when
         ``--skip-alignment`` is set).
      3. Build the sgRNA index with ``build_index``.
      4. Unless alignment is skipped, write the input reads to a temporary
         file via ``convert_read`` and align them with ``bowtie2_align``;
         otherwise reuse ``<out_dir>/cs.bam``.
      5. Summarise reads around the cleavage site with ``fetch_reads``.
    """
    # parse options
    options = docopt(__doc__, version=__version__)
    if options['--seq']:
        if not os.path.isfile(options['--seq']):
            sys.exit('Error: wrong seq file!')
        seq = os.path.abspath(options['--seq'])
        seq_flag = True
    else:
        seq = None
        seq_flag = False
    fa = check_fasta(options['--genome'])
    chrom = options['--chrom']
    site = int(options['--site'])
    strand = '+' if options['--strand'] == '1' else '-'
    rlen = int(options['--read-length'])
    alen = int(options['--region-length'])
    clen = int(options['--check-region-length'])
    thread = options['--thread']
    skip_flag = options['--skip-alignment']
    # check output directory
    if not skip_flag:  # not skip alignment
        out_dir = create_dir(options['<out_dir>'])
    else:  # skip alignment
        out_dir = check_dir(options['<out_dir>'])
    # build index for sgRNA
    index_path, offset = build_index(fa, chrom, site, strand, rlen, thread,
                                     out_dir, seq, seq_flag)
    if not skip_flag:  # not skip alignment
        # deal with reads file
        # The context manager closes (and thereby deletes) the temporary
        # file even when read conversion or alignment raises; the original
        # only closed it on the success path.
        with tempfile.NamedTemporaryFile(mode='w+') as reads:
            if options['-R']:
                fq_lst = options['-R'].split(',')
                convert_read(reads, single=fq_lst)
            else:
                fq1_lst = options['-1'].split(',')
                fq2_lst = options['-2'].split(',')
                convert_read(reads, fq1=fq1_lst, fq2=fq2_lst)
            # Rewind before handing the path to the aligner (kept from the
            # original; presumably also relied on to flush pending writes).
            reads.seek(0)
            # mapped reads with bowtie2
            bam = bowtie2_align(index_path, reads.name, thread, out_dir)
    else:
        bam = os.path.join(out_dir, 'cs.bam')
    # fetch cleavage site reads
    fetch_reads(index_path, offset, alen, clen, bam, out_dir)
def fetch_reads(index, offset, alen, clen, bam, out_dir):
    """Write per-read and per-pattern views of alignments around a site.

    Two files are written into ``out_dir``:

    * ``cs_region.txt`` -- a header line, the reference slice
      ``[offset - alen, offset + alen)`` and, for every aligned read, an
      ``Alignment`` line and an ``Insertion`` line cut to the same window.
    * ``cs_count.txt`` -- the reference slice
      ``[offset - clen, offset + clen)`` followed by each distinct
      (alignment, insertion) pattern in that window with its read count,
      in order of first appearance.

    Args:
        index: path passed to ``check_fasta``; the sequence named
            ``sgRNA_region`` is fetched from it.
        offset: centre of the windows, as an index into that sequence.
        alen: half-width of the window written to ``cs_region.txt``.
        clen: half-width of the window used for counting.
        bam: path of the BAM file to read.
        out_dir: directory receiving the two output files.
    """
    cs = os.path.join(out_dir, 'cs_region.txt')
    count = os.path.join(out_dir, 'cs_count.txt')
    with open(cs, 'w') as cs_out, open(count, 'w') as count_out:
        cs_out.write(' ' * alen + 'sgRNA|PAM|\n')
        index_fa = check_fasta(index).fetch('sgRNA_region')
        cs_out.write('Reference: ' + index_fa[offset - alen:offset + alen] +
                     '\n')
        count_out.write('Reference: ' +
                        index_fa[offset - clen:offset + clen] + '\n')
        # (alignment slice, insertion slice) -> number of reads
        cs_count = defaultdict(int)
        # Context manager closes the BAM handle; the original left it open.
        with pysam.AlignmentFile(bam, 'rb') as bam_f:
            for read in bam_f:
                seq = read.query_sequence
                cigar = read.cigartuples
                # Unmapped or sequence-less records have no CIGAR/sequence
                # and previously raised a TypeError; skip them instead.
                if read.is_unmapped or seq is None or cigar is None:
                    continue
                start = read.reference_start
                pos = 0  # cursor into the read sequence
                align_parts = [' ' * start]
                insert_parts = [' ' * start]
                for tag, tlen in cigar:
                    if tag in (0, 7, 8):  # M, =, X
                        align_parts.append(seq[pos:pos + tlen])
                        insert_parts.append(' ' * tlen)
                        pos += tlen
                    elif tag == 1:  # I
                        insert_parts.append(seq[pos:pos + tlen])
                        pos += tlen
                    elif tag == 4:  # S
                        # Soft-clipped bases are in the read but not on the
                        # reference: skip them without drawing anything.
                        pos += tlen
                    elif tag in (2, 3):  # D, N
                        # NOTE(review): the insertion row is not padded here
                        # (unchanged from the original) -- confirm intended.
                        align_parts.append('*' * tlen)
                    # H and P consume neither read nor reference: ignore.
                align = ''.join(align_parts)
                insert = ''.join(insert_parts)
                cs_out.write('Alignment: ' +
                             align[offset - alen:offset + alen] + '\n')
                cs_out.write('Insertion: ' +
                             insert[offset - alen:offset + alen] + '\n')
                key = (align[offset - clen:offset + clen],
                       insert[offset - clen:offset + clen])
                cs_count[key] += 1
        for (align_cs, insert_cs), n in cs_count.items():
            count_out.write('Reads : ' + align_cs + '\t%d\n' % n +
                            'Indel :' + insert_cs + '\n')