def fetch_read2(bam, chrom):
    """Collect the filtered alignments of one reference, keyed by mate info.

    Args:
        bam: object whose ``fetch(chrom)`` yields alignment records exposing
            the attributes used below (presumably a pysam AlignmentFile --
            confirm against the caller).
        chrom: reference name handed to ``bam.fetch``.

    Returns:
        dict mapping ``'name\\tmate_chrom\\tmate_pos\\tmate_strand'`` to
        ``[chrom, start, end, barcode]``; ``start``, ``end`` and ``mate_pos``
        are stored as strings. A later record with the same key overwrites
        an earlier one.
    """
    records = {}
    for aln in bam.fetch(chrom):
        # drop read1 records, secondary alignments and improper pairs
        if aln.is_read1 or aln.is_secondary or not aln.is_proper_pair:
            continue
        # keep only records whose NH tag equals 1
        if aln.get_tag('NH') != 1:
            continue
        ref_name = aln.reference_name
        mate_ref = aln.next_reference_name
        # both mates must sit on the same reference
        if ref_name != mate_ref:
            continue
        # mates must be on opposite strands
        if bool(aln.is_reverse) == bool(aln.mate_is_reverse):
            continue
        if aln.is_reverse:
            strand, mate_strand = '-', '+'
            # last 15 bases, handed to dna_to_rna together with the strand
            barcode = dna_to_rna(aln.query_sequence[-15:], strand=strand)
        else:
            strand, mate_strand = '+', '-'
            # first 15 bases
            barcode = dna_to_rna(aln.query_sequence[:15])
        key_fields = [
            aln.query_name,
            mate_ref,
            str(aln.next_reference_start),
            mate_strand,
        ]
        records['\t'.join(key_fields)] = [
            ref_name,
            str(aln.reference_start),
            str(aln.reference_end),
            barcode,
        ]
    return records
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Find 'AGGT' sites in one intron that pass all configured filters.

    Every 'AGGT' match in the strand-oriented intron sequence is kept only
    if it passes, in order: the distance check (``cal_distance``), the
    no-'N' check on the 23-base window around the site, the score check
    (``cal_score`` against the matrix from ``load_matrix3``) and the
    conservation threshold read from the bigWig file.

    Args:
        options: mapping providing '--genome', '--bigwig', '--min-distance',
            '--min-score' and '--min-phastcons'.
        chrom: reference name of the intron.
        start: intron start coordinate passed to ``fa.fetch``.
        end: intron end coordinate passed to ``fa.fetch``.
        strand: '+' selects the forward-strand arithmetic; any other value
            is handled by the reverse-strand branch.
        intron_info: opaque value returned unchanged when sites are found.

    Returns:
        ``(intron_info, rs_list)`` where each entry of ``rs_list`` is
        ``'pos|left_dist|right_dist|ss3|phastcons'``, or ``(None, None)``
        when no site survives the filters.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # load matrix
    matrix3 = load_matrix3()
    # parse options
    phastcons_f = pyBigWig.open(options['--bigwig'])
    rs_list = []
    # the bigWig handle is opened per call, so always release it on exit
    try:
        min_distance = int(options['--min-distance'])
        min_score = float(options['--min-score'])
        min_phastcons = float(options['--min-phastcons'])
        forward = strand == '+'
        # start to parse rs sites
        for m in re.finditer('AGGT', intron_fa):
            # the match offset is relative to the strand-oriented sequence,
            # so it is counted from `start` on '+' and from `end` otherwise
            if forward:
                pos = start + m.start() + 2
            else:
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            # 23-base window: 20 bases on one side of pos, 3 on the other
            if forward:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 20, pos + 3))
            else:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 20),
                                     strand='-')
            if 'N' in ss3_seq:  # ensure there is no N
                continue
            ss3, score_flag = cal_score(ss3_seq, matrix3, min_score)
            if not score_flag:  # not high score
                continue
            # first value returned by stats() over [pos - 2, pos + 2)
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                # not conserved
                continue
            rs_list.append('%d|%d|%d|%f|%f' % (pos, left_dist, right_dist,
                                                ss3, phastcons))
    finally:
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Collect conservation values for every 4-mer found in one intron.

    For each of the 256 motifs over 'ATCG', every match in the
    strand-oriented intron sequence that passes the distance check
    (``cal_distance``) and has a conservation value in the bigWig file
    contributes that value to the motif's list.

    Args:
        options: mapping providing '--genome', '--bigwig' and
            '--min-distance'.
        chrom: reference name of the intron.
        start: intron start coordinate passed to ``fa.fetch``.
        end: intron end coordinate passed to ``fa.fetch``.
        strand: '+' selects the forward-strand arithmetic; any other value
            is handled by the reverse-strand branch.
        intron_info: unused here; kept so the signature stays compatible.

    Returns:
        ``(rs_list, intron_length)``. ``rs_list`` is a ``defaultdict(list)``
        with one key per motif; each list starts with the placeholder
        ``'x'`` followed by the collected values. ``intron_length`` is
        ``end - start - 2 * min_distance``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # parse options
    phastcons_f = pyBigWig.open(options['--bigwig'])
    rs_list = defaultdict(list)
    # the bigWig handle is opened per call, so always release it on exit
    try:
        min_distance = int(options['--min-distance'])
        forward = strand == '+'
        # start to parse rs sites
        for bases in product('ATCG', repeat=4):
            motif = ''.join(bases)
            # placeholder first element: every motif gets a key even
            # when it has no accepted match
            rs_list[motif].append('x')
            # NOTE(review): finditer yields non-overlapping matches only,
            # so self-overlapping motifs (e.g. 'AAAA') are not counted at
            # every offset -- confirm this is intended
            for m in re.finditer(motif, intron_fa):
                if forward:
                    pos = start + m.start() + 2
                else:
                    pos = end - m.start() - 2
                _left_dist, _right_dist, dist_flag = cal_distance(
                    pos, start, end, min_distance)
                if not dist_flag:  # not enough distance
                    continue
                # first value returned by stats() over [pos - 2, pos + 2)
                phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
                if phastcons is None:  # no conservation score
                    continue
                rs_list[motif].append(phastcons)
    finally:
        phastcons_f.close()
    intron_length = end - start - 2 * min_distance
    return (rs_list, intron_length)
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Find matches of a user-supplied motif in one intron.

    Every match of ``options['-m']`` in the strand-oriented intron sequence
    is kept only if it passes the distance check (``cal_distance``) and
    reaches the conservation threshold read from the bigWig file.

    Args:
        options: mapping providing '--genome', '-m', '--bigwig',
            '--min-distance' and '--min-phastcons'. The '-m' value is
            passed to ``re.finditer`` and is therefore treated as a
            regular expression.
        chrom: reference name of the intron.
        start: intron start coordinate passed to ``fa.fetch``.
        end: intron end coordinate passed to ``fa.fetch``.
        strand: '+' selects the forward-strand arithmetic; any other value
            is handled by the reverse-strand branch.
        intron_info: opaque value returned unchanged when sites are found.

    Returns:
        ``(intron_info, rs_list)`` where each entry of ``rs_list`` is
        ``'pos|left_dist|right_dist|phastcons'``, or ``(None, None)`` when
        no site survives the filters.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # parse options
    motif = options['-m']
    phastcons_f = pyBigWig.open(options['--bigwig'])
    rs_list = []
    # the bigWig handle is opened per call, so always release it on exit
    try:
        min_distance = int(options['--min-distance'])
        min_phastcons = float(options['--min-phastcons'])
        forward = strand == '+'
        # start to parse rs sites
        for m in re.finditer(motif, intron_fa):
            # the match offset is relative to the strand-oriented sequence,
            # so it is counted from `start` on '+' and from `end` otherwise
            if forward:
                pos = start + m.start() + 2
            else:
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            # first value returned by stats() over [pos - 2, pos + 2)
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                # not conserved
                continue
            rs_list.append('%d|%d|%d|%f' % (pos, left_dist, right_dist,
                                             phastcons))
    finally:
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
def build_index(fa, chrom, site, strand, rlen, thread, out_dir, seq, seq_flag):
    """Create ``<out_dir>/sgRNA.fa`` and build a bowtie2 index on it.

    The FASTA is either a symlink to a user-supplied file (``seq_flag``
    true) or a single record fetched from ``fa`` around ``site``.

    Args:
        fa: object with ``fetch(chrom, start, end)``; only used when
            ``seq_flag`` is false.
        chrom: reference name of the region to fetch.
        site: coordinate the fetched window is built around.
        strand: '+' uses the window [site - (rlen - 10), site + (rlen - 20));
            any other value uses [site - (rlen - 20), site + (rlen - 10)).
        rlen: length used to size the window (presumably the read length --
            confirm against the caller).
        thread: value substituted into ``--threads``.
        out_dir: existing directory that receives ``sgRNA.fa``.
        seq: path of a ready-made FASTA; only used when ``seq_flag`` is true.
        seq_flag: when true, link ``seq`` instead of fetching a sequence.

    Returns:
        ``(index_path, offset)`` where ``offset`` equals ``site - start``
        of the window selected by ``strand``.

    Raises:
        SystemExit: if ``bowtie2-build`` is not found by ``which``.
    """
    print('Build index...')
    if strand == '+':
        start = site - (rlen - 10)
        end = site + (rlen - 20)
        offset = rlen - 10
    else:
        start = site - (rlen - 20)
        end = site + (rlen - 10)
        offset = rlen - 20
    index_path = os.path.join(out_dir, 'sgRNA.fa')
    # Drop any leftover from an earlier run: a stale entry would make
    # os.symlink fail, and a stale symlink would make open(..., 'w')
    # truncate the file it points to.
    if os.path.lexists(index_path):
        os.remove(index_path)
    if seq_flag:
        # A relative link target is resolved against the link's directory,
        # not the working directory, so store an absolute target.
        os.symlink(os.path.abspath(seq), index_path)
    else:
        # fetch sgRNA region sequence
        with open(index_path, 'w') as out:
            out.write('>sgRNA_region\n')
            out.write(
                dna_to_rna(fa.fetch(chrom, start, end), strand=strand) + '\n')
    # build index
    if not which('bowtie2-build'):
        sys.exit('Error: no bowtie2-build installed!')
    command = 'bowtie2-build -q --threads %s %s %s'
    command = command % (thread, index_path, index_path)
    run_command(command, 'Error: cannot build index for sgRNA!')
    return index_path, offset