def calculate_conversions(loci,
                          samples,
                          max_k=1,
                          antisense=False,
                          genome_fa=None,
                          read_select_fn=None):
    """
    Tabulate per-position conversion calls for every feature and sample.

    The result is a list with one entry per linkage size k = 1..max_k:
        conv[k-1][sampleID][featID][base][position] = [(wt, mut, other), ...]
    where k = number of linked bases and base is 'C' or 'G'. One item is
    appended to each position's list per BAM file processed for that base.

    loci           -- dict: feat_id -> (chr, strand, exons, ...)
    samples        -- dict: sample label -> {strand: [bam tuple, ...]}
    max_k          -- largest number of linked bases to evaluate
    antisense      -- if true, read from the strand opposite to the feature
    genome_fa      -- genome fasta path handed to ru.load_chr_seqs
    read_select_fn -- passed through unchanged to conversion_blocks

    Exits the program if the genome fasta cannot be read.
    """
    try:
        chr_seqs = ru.load_chr_seqs(genome_fa)
    except IOError:
        sys.exit('***ERROR: Genome fasta not found: %s, exiting' % genome_fa)

    bases = ['C', 'G']
    n_loci = len(loci)
    all_profiles = []

    for k in range(1, max_k + 1):
        profiles = {}
        for counter, (feat_id, feat_data) in enumerate(loci.items()):
            chrom, strand, exons = feat_data[:3]
            coord = '%s:%d-%d' % (chrom, exons[0][0], exons[-1][1])
            is_minus = strand == '-'
            tx_coords = ru.tx_indexing(exons, is_minus)
            tx_coords_inv = ru.tx_indexing(exons, is_minus, inverse=True)
            ref_chr_seq = chr_seqs[chrom]
            if antisense:
                strand = ru.anti_strand_str[strand]

            for sample_label, bamfile_dict in sorted(samples.items()):
                sample_profiles = profiles.setdefault(sample_label, {})
                if feat_id not in sample_profiles:
                    # One (initially empty) list per transcript position,
                    # kept separately for each base
                    n_positions = 1 + max(tx_coords.values())
                    sample_profiles[feat_id] = {
                        'C': [[] for _ in range(n_positions)],
                        'G': [[] for _ in range(n_positions)],
                    }
                feat_profiles = sample_profiles[feat_id]

                for btuple in bamfile_dict[strand]:
                    # zip pairs the first file of the tuple with 'C' and the
                    # second with 'G'
                    for bamfile, base in zip(btuple, bases):
                        conv = conversion_blocks(base, [bamfile],
                                                 coord,
                                                 strand,
                                                 ref_chr_seq,
                                                 SAMFLAGS[strand],
                                                 read_select_fn,
                                                 k=k,
                                                 Fsamflag=FSAMFLAGS[strand])
                        for i, calls in enumerate(feat_profiles[base]):
                            genomic_coord = tx_coords_inv.get(i, -1)
                            # Positions without a genomic coordinate are
                            # left untouched
                            if genomic_coord > -1:
                                calls.append(
                                    conv.get(genomic_coord, [0, 0, 0]))

            #Status counter
            if counter % 50 == 0:
                sys.stderr.write(
                    '%d out of %d regions completed for linked_bases = %d\n' %
                    (counter, n_loci, k))
        all_profiles.append(profiles)

    return all_profiles
def calculate_frag_stats(loci, samples, buffer=75, antisense=False):
    """
    Summarise apparent fragment lengths per feature and per sample.

    Only fragments whose start is beyond `buffer` and whose end is before
    (feature length - buffer) contribute.

    Returns a 4-tuple of dicts:
        frag_stats[sampleID][featID] = (N, mean, median, sd)
        combined_stats[sampleID]     = (N, mean, median, sd), pooled over
                                       all features
        combined_hist[sampleID]      = np.bincount of the pooled lengths
        frag_counts[sampleID][featID] = np.bincount of how many times each
                                       distinct (start, end) fragment occurred
    Statistics are (0, nan, nan, nan) when no fragment qualified.
    """

    def summarize(lengths):
        # (N, mean, median, sd); a fresh NaN tuple when there is no data
        if not lengths:
            return (0, float('nan'), float('nan'), float('nan'))
        return (len(lengths), np.mean(lengths), np.median(lengths),
                np.std(lengths))

    frag_lengths = {}  #per feature per sample
    combined_lengths = {}  #pooled over all features per sample
    frag_stats = {}
    combined_stats = {}
    combined_hist = {}
    frag_counts = {}  #per feature per sample

    for feat_id, feat_data in loci.items():
        chrom, strand, exons = feat_data[:3]
        coord = '%s:%d-%d' % (chrom, exons[0][0], exons[-1][1])
        is_minus = strand == '-'
        tx_coords = ru.tx_indexing(exons, is_minus)
        tx_coords_inv = ru.tx_indexing(exons, is_minus, inverse=True)
        feat_len = max(tx_coords_inv.keys())
        if antisense:
            strand = ru.anti_strand_str[strand]

        for sample_label, bamfile_dict in sorted(samples.items()):
            if sample_label not in frag_lengths:
                frag_lengths[sample_label] = {}
                combined_lengths[sample_label] = []
                frag_stats[sample_label] = {}
                combined_stats[sample_label] = []
                combined_hist[sample_label] = {}
                frag_counts[sample_label] = {}
            if feat_id not in frag_lengths[sample_label]:
                frag_lengths[sample_label][feat_id] = []
                frag_counts[sample_label][feat_id] = {}

            bamfiles = []
            for btuple in bamfile_dict[strand]:
                for bfile in btuple:
                    bamfiles.append(bfile)

            fragments = tx_pileup(bamfiles,
                                  coord,
                                  tx_coords,
                                  SAMFLAGS[strand],
                                  raw_frags=True,
                                  Fsamflag=FSAMFLAGS[strand])

            seen = frag_counts[sample_label][feat_id]
            length_list = []
            for s, e in fragments:
                if not (s > buffer and e < feat_len - buffer):
                    continue
                length_list.append(e - s)
                # NOTE(review): the occurrence tally is applied to the same
                # buffer-filtered fragments as the length list -- confirm
                seen[(s, e)] = seen.get((s, e), 0) + 1

            #update
            frag_lengths[sample_label][feat_id] += length_list
            combined_lengths[sample_label] += length_list

        #Statistics per feature
        for sample_label, per_feature in frag_lengths.items():
            frag_stats[sample_label][feat_id] = summarize(per_feature[feat_id])
            #Number of times each fragment appears
            occurrences = list(frag_counts[sample_label][feat_id].values())
            frag_counts[sample_label][feat_id] = np.bincount(occurrences)

    #Statistics per sample
    for sample_label, pooled in combined_lengths.items():
        combined_stats[sample_label] = summarize(pooled)
        combined_hist[sample_label] = np.bincount(pooled)

    return frag_stats, combined_stats, combined_hist, frag_counts
def joint_conversions(loci,
                      profiles,
                      expt,
                      ctrl,
                      outfile=None,
                      z_thresh=1,
                      g_fdr=0.05,
                      find_depletions=True):
    """
    Merge conversion calls made at several linkage sizes into one
    non-redundant list per feature.
    E.g., mono, di, tri base alleles simultaneously and non-redundantly.

    `conversions` is run once per entry of `profiles` (with mult = 1, 2, ...).
    For every transcript position the call with the lowest 'g_pool_padj' wins;
    any call that loses at one of its positions is discarded entirely.
    Surviving calls gain 'coord' (1-based genomic interval) and 'strand' keys
    and are returned sorted by their 'pos' tuple.

    loci     -- dict: feat_id -> (chr, strand, exons, seq)
    profiles -- sequence of conversion profiles, one per linkage size
    outfile  -- optional path; if given, a tab-delimited table with a header
                row and one row per surviving call is written there
    z_thresh, g_fdr, find_depletions -- passed through to `conversions`

    Returns joint_convs[feat_id] = [conv_dict, ...].
    Exits the program if `conversions` fails on the supplied profiles.
    """
    convs = []
    try:
        for i, profile in enumerate(profiles):
            convs.append(
                conversions(loci,
                            profile,
                            expt,
                            ctrl,
                            z_thresh=z_thresh,
                            g_fdr=g_fdr,
                            mult=i + 1,
                            find_depletions=find_depletions))
    except Exception:
        # Deliberately not a bare except: Ctrl-C / SystemExit must not be
        # reported as a malformed input file
        sys.exit(
            '***ERROR: Improperly formatted data file? (should be a _conversions_ pickle)'
        )

    fields = [
        'feat_id', 'label', 'coord', 'strand', 'base', 'linked_bases', 'pos',
        'z_expt', 'log_diff', 'g_pool', 'g_pool_p', 'g_pool_padj', 'seq'
    ]

    # The output file is opened only once the conversions are known to be
    # usable, and is always closed, even if a later step raises
    fw = open(outfile, 'w') if outfile else None
    try:
        if fw:
            fw.write('\t'.join(fields) + '\n')

        joint_convs = {}
        for feat_id, feat_data in loci.items():
            chrom, strand, exons, _seq = feat_data
            tx_coords = ru.tx_indexing(exons, strand == '-', inverse=True)

            # Every call competing for each transcript position
            positions = {}
            for conv_dict in convs:
                for conv in conv_dict[feat_id]:
                    for index in conv['pos']:
                        positions.setdefault(index, []).append(
                            (conv['g_pool_padj'], conv))

            filtered_positions = {}
            for conv_list in positions.values():
                #Keep the conversion with lowest p value
                conv_list.sort(key=lambda x: x[0])
                best = conv_list[0][1]
                if best['label'] not in filtered_positions:
                    filtered_positions[best['label']] = (best['pos'], best)
                # A call that loses anywhere is dropped, even if it was
                # recorded as the winner at another position
                for _pval, conv in conv_list[1:]:
                    filtered_positions[conv['label']] = ((-1, ), None)

            # Drop discarded entries first and order by position only, so
            # that dicts / None are never compared with each other
            survivors = sorted(
                (entry for entry in filtered_positions.values() if entry[1]),
                key=lambda entry: entry[0])

            joint_convs[feat_id] = []
            for _pos, conv in survivors:
                #Genomic coordinate
                g_start = tx_coords[conv['pos'][0]]
                g_end = tx_coords[conv['pos'][-1]]
                if g_start > g_end:
                    g_start, g_end = g_end, g_start
                conv['coord'] = '%s:%d-%d' % (chrom, g_start + 1, g_end + 1)
                conv['strand'] = strand
                joint_convs[feat_id].append(conv)
                if fw:
                    out_line = [ru.pretty_str(conv[x]) for x in fields]
                    fw.write('\t'.join(out_line) + '\n')
    finally:
        if fw:
            fw.close()

    return joint_convs
def calculate_coverage(loci,
                       samples,
                       antisense=False,
                       frag_sizes=None,
                       sampling_rate=1,
                       paired=True,
                       rmdup_strictest=False,
                       dup_limit=0,
                       strict_dup_filtering=False):
    """
    Collect per-position coverage for every feature in every sample.

    Each BAM file of a sample is piled up on its own via tx_pileup and
    contributes one value to every position's list:
        profiles[sampleID][featID][position] = [count per BAM file, ...]
    Totals are accumulated per sample and strand by summing those
    per-position lists element-wise:
        totals[sampleID] = [+ strand totals, - strand totals]
    (a slot stays an empty list until data for that strand is seen).

    loci      -- dict: feat_id -> (chr, strand, exons, ...)
    samples   -- dict: sample label -> {strand: [bam tuple, ...]}
    antisense -- if true, read from the strand opposite to the feature
    frag_sizes, sampling_rate, dup_limit, strict_dup_filtering
              -- passed through to tx_pileup
    paired, rmdup_strictest -- accepted but not used by this function

    Returns (profiles, total_counts).
    """
    profiles = {}
    total_counts = {}
    n_loci = len(loci)

    for k, (feat_id, feat_data) in enumerate(loci.items()):
        chrom, strand, exons = feat_data[:3]
        coord = '%s:%d-%d' % (chrom, exons[0][0], exons[-1][1])
        tx_coords = ru.tx_indexing(exons, strand == '-')
        if antisense:
            strand = ru.anti_strand_str[strand]

        for sample_label, bamfile_dict in sorted(samples.items()):
            if sample_label not in profiles:
                profiles[sample_label] = {}
                total_counts[sample_label] = [[], []]  #strand separated +, -
            sample_profiles = profiles[sample_label]
            if feat_id not in sample_profiles:
                n_positions = 1 + max(tx_coords.values())
                sample_profiles[feat_id] = [[] for _ in range(n_positions)]
            feat_profile = sample_profiles[feat_id]

            for btuple in bamfile_dict[strand]:
                for bamfile in btuple:
                    sample_cov = tx_pileup(
                        [bamfile],
                        coord,
                        tx_coords,
                        SAMFLAGS[strand],
                        frag_sizes=frag_sizes,
                        dup_limit=dup_limit,
                        strict_dup_filtering=strict_dup_filtering,
                        sampling_rate=sampling_rate,
                        Fsamflag=FSAMFLAGS[strand])
                    for i, per_file_counts in enumerate(feat_profile):
                        per_file_counts.append(sample_cov[i])

        #Update total counts
        # The slot follows the strand actually read (after any antisense flip)
        strand_slot = 0 if strand == '+' else 1
        for sample_label, feat_dict in profiles.items():
            totals = total_counts[sample_label]
            for counts in feat_dict[feat_id]:
                if not counts:
                    continue
                if len(totals[strand_slot]) == 0:
                    totals[strand_slot] = np.array(counts)
                else:
                    totals[strand_slot] = np.add(totals[strand_slot], counts)

        #Status counter
        if k % 50 == 0:
            sys.stderr.write('%d out of %d regions completed\n' %
                             (k, n_loci))

    return profiles, total_counts
def region_finder(loci,
                  profiles,
                  expt,
                  ctrl,
                  ratio_fn=profile_ratio,
                  count_fn=raw_profile_replicates,
                  motifs=None,
                  lfc_thresh=math.log(1.25, 2),
                  low_conf_lfc_thresh=math.log(1.25, 2),
                  min_peak_width=5,
                  min_ctrl_coverage=0,
                  min_expt_coverage=0,
                  outfile_root='',
                  peaks_only=False,
                  do_bedgraph=True,
                  write_bedgraph_header=True):
    """
    Iterate through ratios to identify responsive regions (peaks/valleys)

    Candidate regions come from responsive_regions() applied to the
    expt/ctrl ratio profile of each feature (threshold low_conf_lfc_thresh).
    A candidate is kept only if it passes all filters: summed control and
    experiment counts at its critical point, min_peak_width, |value| >=
    lfc_thresh, and (with peaks_only) not being a 'valley'. Kept regions are
    annotated in place with counts, labels, genomic coordinates, sequences
    around their summits and the motifs lying strictly inside their bounds.

    loci     -- dict: feat_id -> (chr, strand, exons, seq)
    profiles -- coverage profiles handed to ratio_fn and count_fn
    expt, ctrl -- sample identifiers handed to ratio_fn and count_fn
    ratio_fn -- callable(loci, profiles, expt, ctrl) -> {feat_id: ratios}
    count_fn -- callable(feat_id, profiles, sample) -> per-position counts
    motifs   -- passed to motif_locs
    outfile_root -- if non-empty, '.txt', '.bed', sequence and (when
                    do_bedgraph is set) '.bedgraph' outputs are written
    write_bedgraph_header -- use 'expt/ctrl' as the bedgraph header name,
                    otherwise an empty name

    Returns regions[feat_id] = [region_dict, ...]; features without any
    surviving region are absent.
    Exits the program if ratio_fn fails on the supplied profiles.
    """
    regions = {}
    ratio_cache = []
    try:
        pr = ratio_fn(loci, profiles, expt, ctrl)
    except Exception:
        # Deliberately not a bare except: Ctrl-C / SystemExit must not be
        # reported as a malformed input file
        sys.exit(
            '***ERROR: Improperly formatted data file? (should be a _coverage_ pickle)'
        )

    for feat_id, feat_data in loci.items():
        chrom, strand, exons, seq = feat_data
        seq_len = len(seq)
        tx_coords = ru.tx_indexing(exons, strand == '-', inverse=True)

        #Find responsive regions
        rr = responsive_regions(pr[feat_id], threshold=low_conf_lfc_thresh)

        #Update the ratio list
        if do_bedgraph:
            ratio_cache += bedgraph_entries(chrom, pr[feat_id], tx_coords)

        #Identify motif locations
        feat_motifs = motif_locs(seq, motifs, window=1)

        #Get raw counts
        all_expt_counts = count_fn(feat_id, profiles, expt)
        all_ctrl_counts = count_fn(feat_id, profiles, ctrl)

        #For each responsive region, calculate raw reads
        # NOTE: keys are added to r_data in a fixed order on purpose, in
        # case the output writers depend on dict ordering
        for r_data in rr:
            #Get raw counts at the critical point
            r_data['expt_counts'] = all_expt_counts[r_data['crit_pt']]
            r_data['ctrl_counts'] = all_ctrl_counts[r_data['crit_pt']]
            r_data['sum_expt_counts'] = sum(r_data['expt_counts'])
            r_data['sum_ctrl_counts'] = sum(r_data['ctrl_counts'])

            if r_data['sum_ctrl_counts'] < min_ctrl_coverage or \
               r_data['sum_expt_counts'] < min_expt_coverage or \
               r_data['width'] < min_peak_width or \
               abs(r_data['value']) < lfc_thresh or \
               (peaks_only and r_data['type'] == 'valley'):
                continue

            bound_start, bound_end = r_data['bound'][0], r_data['bound'][1]

            r_data['expt'] = expt
            r_data['ctrl'] = ctrl
            r_data['feat_id'] = feat_id
            r_data['long_label'] = r_data['feat_id'] + '_' + r_data['label']
            r_data['dist_from_end'] = min(bound_start - 1,
                                          seq_len - bound_end - 1)
            r_data['crit_pt_dist_from_end'] = min(
                r_data['crit_pt'], seq_len - r_data['crit_pt'])

            # Genomic coordinates are swapped for '-' strand features
            g_start, g_end = tx_coords[bound_start], tx_coords[bound_end]
            if strand == '-':
                g_start, g_end = g_end, g_start
            r_data['chr'] = chrom
            r_data['genomic_start'] = g_start
            r_data['genomic_end'] = g_end + 1
            r_data['strand'] = strand
            r_data['coord'] = '%s:%d-%d' % (chrom, g_start + 1, g_end + 1)
            r_data['seq'] = seq[bound_start:(1 + bound_end)]

            #Get the sequence of the summit
            # Each summit is reported with up to flank_amt lower-cased
            # flanking bases on either side
            flank_amt = 10
            summit_seqs = []
            min_s = seq_len
            max_e = 0
            for s, e in r_data['summit']:
                l_bound = max(0, s - flank_amt)
                r_bound = min(e + flank_amt, seq_len)
                flank_l = seq[l_bound:s].lower()
                flank_r = seq[e:r_bound].lower()
                summit_seqs.append(flank_l + seq[s:e] + flank_r)
                # Track the widest span covered by any flanked summit
                if l_bound < min_s:
                    min_s = l_bound
                if r_bound > max_e:
                    max_e = r_bound
            r_data['summit_seq'] = summit_seqs

            #15nt +/-
            flank_addition = 15
            span_start = max(0, min_s - flank_addition)
            span_end = min(max_e + flank_addition, seq_len)
            r_data['spanning_summit_seq'] = seq[span_start:span_end]

            #Annotate motifs
            # Only motifs lying strictly inside the region bounds are kept
            r_data['motifs'] = [(s, short_name)
                                for s, e, short_name, _m in feat_motifs
                                if bound_start < s and e < bound_end]

            regions.setdefault(feat_id, []).append(r_data)

    if outfile_root:
        output_region_data(regions, outfile_root + '.txt')
        output_region_bed(regions, outfile_root + '.bed')
        output_region_seq(regions, outfile_root, do_valley=not peaks_only)
        if do_bedgraph:
            if write_bedgraph_header:
                header_name = '%s/%s' % (expt, ctrl)
            else:
                header_name = ''
            output_region_bedgraph(ratio_cache,
                                   outfile_root + '.bedgraph',
                                   header_name=header_name)
    return regions