def submit(self, gene_list, gene_file=None, **kwargs):
    """Accept a gene list submission, store it in a session and redirect.

    Parameters
    ----------
    gene_list
        Whitespace-separated gene names as a single string. The literal
        value ``'example'`` is replaced by the contents of the bundled
        ``data/p65_bound_gene_list.txt`` resource.
    gene_file
        Optional upload object. When it and its ``file`` attribute are
        both not None, the contents of ``gene_file.file`` are used instead
        of ``gene_list``.
    **kwargs
        Passed unchanged to ``self.parse_kwargs``.

    Raises
    ------
    cherrypy.HTTPRedirect
        Always raised (status 303) once the session has been saved, to
        send the client to the processing page for the new ``hid``.
    """
    if gene_list == 'example':
        example_fn = resource_filename('adipo_sight', 'data/p65_bound_gene_list.txt')
        # use a context manager so the example file handle is closed promptly
        with open(example_fn) as example_f:
            gene_list = example_f.read()

    self.parse_kwargs(kwargs)

    log.debug(self.args.get('gene_file'))
    cherrypy.log('gene_file:%s' % str(gene_file))

    # an uploaded file takes precedence over the text field
    if (gene_file is not None and gene_file.file is not None):
        gene_list_str = gene_file.file.read()
    else:
        gene_list_str = gene_list

    gene_list_strip = re.sub(r'\s+', ' ', gene_list_str)
    cherrypy.log('user submitted genes: %s' % gene_list_strip)
    gene_list = gene_list_strip.split()

    # make sure the genes are unique
    # NOTE(review): uniqueness is enforced before lower-casing, so names that
    # differ only by case survive as duplicate lower-case entries -- confirm
    # whether that is intended
    orig_gene_list = list(set(gene_list))

    # make all genes lower case as they are in the db
    gene_list = [g.lower() for g in orig_gene_list]

    # sort both the original and lower()ed gene lists in parallel,
    # put them back
    both_gene_lists = sorted(zip(gene_list, orig_gene_list))
    if both_gene_lists:
        gene_list, orig_gene_list = zip(*both_gene_lists)
    else:
        # zip(*[]) yields nothing to unpack; an empty submission used to die
        # here with a ValueError, so fall back to two empty tuples
        gene_list, orig_gene_list = (), ()

    # two-way lookup between lower-case and submitted spellings; the
    # original->lower pairs are applied last, as before
    self.gene_name_map = dict(zip(gene_list, orig_gene_list))
    self.gene_name_map.update(zip(orig_gene_list, gene_list))

    # the session id is derived from the sorted lower-case gene names
    hid = str(hash(''.join(gene_list)))
    session_d = self.sessions.get(hid)

    session_d['hid'] = hid
    session_d['gene_list'] = gene_list
    session_d['orig_gene_list'] = orig_gene_list
    session_d['gene_name_map'] = self.gene_name_map

    self.gene_list = gene_list
    self.orig_gene_list = orig_gene_list

    self.sessions.save(hid, session_d)

    # now that we've set everything, redirect to the processing page
    raise cherrypy.HTTPRedirect("http://fraenkel.mit.edu/adipo_sight/processing?hid=%s" % hid, status=303)
def run(self):
    """Compute motif enrichment for the session's genes and store it.

    Reads ``gene_list`` from ``self.session_d``, queries the database for
    motif scores of regions within the ``upstream``/``downstream`` window
    given in ``self.args``, and for each condition found compares those
    scores against a sampled background with ``mww_multiprocess``. Motifs
    whose p-value is below ``self.args.get('diff_hyp_pval')`` are grouped
    by motif cluster.

    Side effects on ``session_d``: sets ``'found'``, ``'missing'`` and
    ``'motif_enrich'`` (a dict with keys ``'motifs'``, ``'hs_regions'`` and
    ``'enriched_motifs'``).

    Returns an error message string when ``self.session_d`` is None.
    """
    session_d = self.session_d
    if session_d is None:
        log.error('hid passed but no session found, aborting')
        return 'Error occurred, please try submitting again'

    gene_list = session_d['gene_list']
    d = {}

    # get motif scores
    cherrypy.log('getting motif scores')

    # sqlite has a limitation of 999 SQL variables, need to split up
    # this query
    batch_size = 500
    gene_list_batches = [gene_list[i:i + batch_size]
                         for i in xrange(0, len(gene_list), batch_size)]
    cherrypy.log('gene_list_batches len()s: %s' % str([len(b) for b in gene_list_batches]))

    scores = []
    for batch in gene_list_batches:
        score_q = (self.db_session.query(db.Region, db.SeqData)
                   .join(db.RegionMembership)
                   .join(db.RegionSet)
                   .join(db.SeqData)
                   .filter(db.SeqData.seq_type.has(db.SeqType.name == 'motif scores'))
                   .filter(db.RegionSet.name.in_(batch))
                   .filter(db.RegionMembership.dist_to_feature.between(
                       -int(self.args['upstream']),
                       int(self.args['downstream'])
                   ))
                   )
        scores.extend(score_q.all())

    score_mat = []
    gene_names = set()
    condition_scores = defaultdict(list)
    cherrypy.log('loading motif scores')
    for region, seqdata in scores:
        # record every region set name this region is a member of
        for region_membership in region.membership:
            region_set = region_membership.region_set
            gene_names.add(region_set.name)

        # NOTE(review): the score bookkeeping below is done once per query
        # row, not once per membership -- confirm against the original layout
        condition = seqdata.condition.name
        # NOTE: values are unpickled straight from the database; this is only
        # safe while the database content is trusted
        motif_scores = cPickle.loads(seqdata.value)
        condition_scores[condition].append(motif_scores)
        score_mat.append(motif_scores)

    cherrypy.log('found conditions: %s' % condition_scores.keys())

    score_mat = np.array(score_mat)

    session_d['found'] = gene_names
    session_d['missing'] = [g for g in gene_list if g not in gene_names]

    # compare motif scores of requested genes to all hypersensitive regions
    # in the dataset
    cherrypy.log('loading motif background')

    # pick n random background sequences, but seed so the same indices are picked for every
    # different # of input DHS regions
    # NOTE(review): this reseeds the module-level RNG shared by the whole
    # process
    random.seed("jo mama")

    # the motif name and cluster tables do not depend on the condition, so
    # read them once here (and close the files) rather than on every pass
    motif_name_fn = resource_filename('adipo_sight', 'data/motif_names.txt')
    with open(motif_name_fn) as motif_name_f:
        motif_names = np.array(motif_name_f.readlines())

    motif_cluster_fn = resource_filename('adipo_sight', 'data/motif_clusters.txt')
    with open(motif_cluster_fn) as motif_cluster_f:
        # line index -> integer cluster id found on that line
        motif_cluster_map = dict((i, int(m)) for i, m in enumerate(motif_cluster_f))

    # walk through the conditions and compute scores
    sig_scores = defaultdict(dict)
    hs_regions = {}
    enriched_motifs = {}
    for c, scores in condition_scores.items():

        score_mat = np.array(scores)
        hs_regions[c] = len(scores)

        # get the background out for this condition
        motif_bg_fn = resource_filename('adipo_sight', 'data/%s_hypersensitive_peaks_bg_motif_scores.npy' % c)
        all_scores = np.load(motif_bg_fn).T
        # sample at most as many background rows as there are foreground rows
        bg_inds = random.sample(xrange(all_scores.shape[0]),
                                min(score_mat.shape[0], all_scores.shape[0]))
        all_scores = all_scores[bg_inds, :]
        cherrypy.log('score_mat.shape = %s' % str(score_mat.shape))
        cherrypy.log('all_scores.shape = %s' % str(all_scores.shape))

        cherrypy.log('done loading motif background')

        # calculate MWW
        pvals = mww_multiprocess(score_mat.T, all_scores.T, True)

        log.debug('motif scores for condition: %s' % c)
        log.debug('pvals: %s' % str(pvals.shape))

        thresh = pvals < self.args.get('diff_hyp_pval')
        thresh_inds = np.where(thresh)[0]
        thresh_names, thresh_pvals = motif_names[thresh], pvals[thresh]
        thresh_imgs = np.array(['images/motif_logos/%03d_motif.png' % i for i in thresh_inds])

        # collapse significant motifs onto their clusters, keeping the
        # smallest p-value seen per cluster for this condition
        cluster_set = set()
        for i, n, p in zip(thresh_inds, thresh_names, thresh_pvals):
            cluster_i = motif_cluster_map[i]
            sig_scores[cluster_i].setdefault('name', set()).add(n.strip())
            sig_scores[cluster_i][c] = min(sig_scores[cluster_i].get(c, 1.), p)
            cluster_set.add(cluster_i)

        enriched_motifs[c] = len(cluster_set)

    d['motifs'] = dict(sig_scores)
    d['hs_regions'] = hs_regions
    d['enriched_motifs'] = enriched_motifs

    session_d['motif_enrich'] = d