def calc_occupancy_scores(self, factor, motif, tf_concs=(1e-6,)):
    """Summarise ATAC-weighted motif occupancy at each TF concentration.

    For every concentration in `tf_concs`, the per-position occupancy is
    computed as ``logistic(log(tf_conc) - score/(R*T))`` and multiplied by
    the ATAC-seq coverage normalised to its own maximum.

    Args:
        factor: not used by this method; retained so existing callers keep
            working.
        motif: object supporting ``len()`` and exposing ``name``, which is
            used as the key into ``self.score_cov``.
        tf_concs: iterable of concentrations to evaluate. The default is a
            tuple so the default object cannot be mutated between calls.

    Returns:
        A pair of lists ``(mean_rvs, max_rvs)`` holding, for each entry of
        `tf_concs` in order, the mean and the maximum weighted occupancy.
    """
    mean_rvs = []
    max_rvs = []

    # Drop the first len(motif)+1 coverage entries before normalising.
    # NOTE(review): presumably this aligns the coverage with score_cov;
    # confirm against the code that fills score_cov.
    trimmed_atacseq_cov = self.atacseq_cov[len(motif)+1:]
    atacseq_weights = trimmed_atacseq_cov/trimmed_atacseq_cov.max()

    # The scaled score does not depend on the concentration, so compute it
    # once rather than on every iteration.
    scaled_scores = self.score_cov[motif.name]/(R*T)

    for tf_conc in tf_concs:
        raw_occ = logistic(numpy.log(tf_conc) - scaled_scores)
        occ = raw_occ*atacseq_weights
        # The original appended to undefined names (mean_rv / max_rv) and
        # returned mean_rv, which raised NameError on every call.
        mean_rvs.append(occ.mean())
        max_rvs.append(occ.max())

    return mean_rvs, max_rvs
def find_optimal_GFE(motif, pks, chipseq_scores, atacseq_signal):
    """Scan GFE values and report the one best correlated with ChIP-seq.

    For each GFE in ``numpy.arange(-20, 10, 1.0)`` the motif is updated via
    ``motif.build_occupancy_weights(4, GFE)``, every peak sequence is scored
    with ``motif.iter_seq_score``, and the Spearman correlation between the
    per-peak mean of ``logistic(-scores/(R*T))*atacseq_signal`` and
    `chipseq_scores` is recorded.

    Args:
        motif: object supporting ``len()`` and providing ``name``,
            ``build_occupancy_weights`` and ``iter_seq_score``.
        pks: non-empty sequence of peaks; each supports ``len()`` and has a
            ``seq`` attribute.
        chipseq_scores: values correlated against the per-peak means.
        atacseq_signal: multiplied element-wise with the occupancy matrix.
            NOTE(review): must broadcast against a
            ``(len(pks), max_len+1)`` array; confirm with the caller.

    Returns:
        ``(motif.name, best_GFE, max_cor)``. Ties go to the first GFE
        scanned. NaN correlations are ignored; if every correlation is NaN
        the result is ``(motif.name, None, nan)``.

    Side effects:
        Prints one progress line per GFE to stdout, and the final summary
        line to both stdout and stderr.
    """
    res = []
    max_len = max(len(pk) for pk in pks)
    # One row per peak; columns before len(motif)+1 are never written and
    # therefore stay zero.
    scores = numpy.zeros((len(pks), max_len+1))
    for GFE in numpy.arange(-20, 10, 1.0):
        motif.build_occupancy_weights(4, GFE)
        for pk_i, pk in enumerate(pks):
            score_cov = numpy.array(
                [score for pos, score in motif.iter_seq_score(pk.seq)])
            # NOTE(review): requires score_cov to fit the remaining
            # max_len - len(motif) columns; confirm for peaks of unequal
            # length.
            scores[pk_i, len(motif)+1:] = score_cov
        cor = spearmanr(
            (logistic(-scores/(R*T))*atacseq_signal).mean(1),
            chipseq_scores)[0]
        res.append((cor, GFE))
        # Single pre-formatted string: same text under both the print
        # statement and the print function.
        print("%s %s %s" % (motif.name, GFE, res[-1]))

    # Pick the optimum once. The original rebuilt it three times with an
    # equality scan against max(); a NaN correlation in the first slot made
    # max() return NaN, the scan match nothing, and [0] raise IndexError.
    finite = [entry for entry in res if not numpy.isnan(entry[0])]
    if finite:
        # max() returns the first maximal entry, i.e. the lowest GFE on ties,
        # matching the original selection.
        max_cor, best_GFE = max(finite, key=lambda entry: entry[0])
    else:
        max_cor, best_GFE = float('nan'), None

    summary = "%s %s %s" % (motif.name, best_GFE, max_cor)
    print(summary)
    sys.stderr.write(summary + "\n")
    return (motif.name, best_GFE, max_cor)
def calc_summary_stats(self):
    """Return parallel lists of column labels and summary values.

    The columns are, in order: the region length and ATAC-seq coverage
    mean/max, then for each factor that has both a motif and ChIP-seq
    coverage (in sorted order) the mean ChIP-seq coverage per BSID followed
    by, for each of that factor's motifs (sorted by motif name), score,
    ATAC-weighted PWM score and ATAC-weighted occupancy summaries.

    Returns:
        ``(header, rv)``: two lists of equal length, where ``header[i]`` is
        the label for ``rv[i]``.
    """
    # Collected as (label, value) pairs and split into two lists at the end.
    columns = []

    # region size and ATAC-seq coverage
    columns.append(('pk_length', self.stop - self.start))
    columns.append(('ATAC_mean', self.atacseq_cov.mean()))
    columns.append(('ATAC_max', self.atacseq_cov.max()))

    # factors that have both a motif and ChIP-seq coverage
    motif_factors = set(
        motif.factor for name, motif in self.motifs.iteritems())
    factors = sorted(
        motif_factors.intersection(self.chipseq_cov.iterkeys()))

    # Only referenced by the disabled quantile summaries further down.
    percentiles = numpy.array(
        [1e-3, 1e-2, 0.02, 0.05, 0.10, 0.25, 0.50])

    motifs_by_name = sorted(self.motifs.iteritems())

    for factor in factors:
        for BSID, cov in self.chipseq_cov[factor].iteritems():
            columns.append(
                ('%s_%s_mean_ChIPseq_cov' % (factor, BSID), cov.mean()))

        for motif_name, motif in motifs_by_name:
            # only motifs belonging to the current factor
            if factor != motif.factor:
                continue

            motif_scores = self.score_cov[motif_name]
            columns.append(('%s_mean_score' % motif_name,
                            motif_scores.mean()))
            # NOTE(review): the column is labelled "max" but holds .min();
            # left unchanged -- confirm whether this is intentional (e.g.
            # lower scores being better).
            columns.append(('%s_max_score' % motif_name,
                            motif_scores.min()))
            #for percentile, score in self.iter_upper_rank_means(
            #        self.score_cov[motif_name], percentiles):
            #    header.append('%s_q_%.2f_score' % (motif_name, percentile))
            #    rv.append(score)

            # ATAC-seq coverage past the first len(motif)+1 entries, scaled
            # by its maximum.
            trimmed_atacseq_cov = self.atacseq_cov[len(motif)+1:]
            atacseq_weights = (
                trimmed_atacseq_cov/trimmed_atacseq_cov.max())
            #1000, trimmed_atacseq_cov.max())

            w_pwm_scores = self.pwm_cov[motif_name]*atacseq_weights
            columns.append(('%s_mean_w_pwm_score' % motif_name,
                            w_pwm_scores.mean()))
            #for percentile, score in self.iter_upper_rank_means(
            #        w_pwm_scores, percentiles):
            #    header.append('%s_q_%.2f_w_pwm_score' % (motif_name, percentile))
            #    rv.append(score)
            #header.append('%s_max_w_pwm_score' % motif_name)
            #rv.append(w_pwm_scores.max())

            # Disabled (was a bare string literal): occupancy sweep over
            # many TF concentrations.
            #for tf_conc in [1e-30, 1e-20, 1e-15, 1e-10, 1e-7, 1e-5,
            #                1e-2, 1e-1, 1, 1e2, 1e5, 1e7, 1e10, 1e15,
            #                1e20, 1e30 ]:
            #    log_tf_conc = numpy.log(tf_conc)
            #    raw_occ = logistic(
            #        log_tf_conc - self.score_cov[motif.name]/(R*T))
            #    occ = raw_occ*atacseq_weights
            #    header.append('%s_%e_mean_occ' % (motif_name, tf_conc))
            #    rv.append(occ.mean())
            #    #header.append('%s_%e_max_occ' % (motif_name, tf_conc))
            #    #rv.append(occ.max())

            # occupancy at a single fixed concentration of 1e5
            log_tf_conc = numpy.log(1e5)
            raw_occ = logistic(log_tf_conc - motif_scores/(R*T))
            occ = raw_occ*atacseq_weights
            columns.append(('%s_mean_occ' % motif_name, occ.mean()))
            columns.append(('%s_max_occ' % motif_name, occ.max()))

            # Disabled (was a bare string literal):
            # XXX
            # find the raw occupancy that provides the best correspondence
            # between the signals, and then try and predict these
            # sequentially
            #unbnd_conc = self.estimate_unbnd_conc_in_region(motif_name)
            ##print self.score_cov[motif_name].mean(), unbnd_conc
            ##print unbnd_conc
            ##unbnd_conc = 0.0
            #raw_occ = logistic(
            #    unbnd_conc + self.score_cov[motif_name]/(R*T))
            #occ = raw_occ*atacseq_weights
            #header.append('%s_weighted_occ' % motif_name)
            #rv.append(occ.mean())
            #header.append('%s_unbnd_conc' % motif_name)
            #rv.append(unbnd_conc)

            #for percentile, score in self.iter_upper_rank_means(
            #        occ, percentiles):
            #    header.append('%s_q_%.2f_occ' % (motif_name, percentile))
            #    rv.append(score)
            #header.append('%s_max_occ' % motif_name)
            #rv.append(occ.max())

    header = [label for label, value in columns]
    rv = [value for label, value in columns]
    return header, rv