Example #1
0
    def submit(self,gene_list,gene_file=None,**kwargs) :
        """Store a submitted gene list in a session and redirect to processing.

        gene_list -- whitespace separated gene names, or the literal string
                     'example' to load the packaged example gene list
        gene_file -- optional uploaded file object; when it has a non-None
                     ``file`` attribute its contents are used instead of
                     gene_list
        kwargs    -- remaining request parameters, handed to parse_kwargs()

        The names are de-duplicated, lower cased and sorted, then written to
        the session identified by ``hid`` together with the original
        spellings.  Always finishes by raising cherrypy.HTTPRedirect (303)
        to the processing page.
        """

        if gene_list == 'example' :
            example_fn = resource_filename('adipo_sight','data/p65_bound_gene_list.txt')
            # context manager so the file handle is released right away
            with open(example_fn) as example_f :
                gene_list = example_f.read()

        self.parse_kwargs(kwargs)

        log.debug(self.args.get('gene_file'))
        cherrypy.log('gene_file:%s'%str(gene_file))
        if (gene_file is not None and
            gene_file.file is not None) :
            gene_list_str = gene_file.file.read()
        else :
            gene_list_str = gene_list

        # treat a missing submission like an empty one instead of letting
        # re.sub fail on None
        if gene_list_str is None :
            gene_list_str = ''

        gene_list_strip = re.sub(r'\s+',' ',gene_list_str)
        cherrypy.log('user submitted genes: %s'%gene_list_strip)

        # make sure the genes are unique
        # NOTE: uniqueness is case sensitive, so two spellings that differ
        # only in case still produce a repeated lower cased entry below
        unique_genes = set(gene_list_strip.split())

        # pair each lower cased name (as stored in the db) with its original
        # spelling and sort the pairs so both lists stay in parallel
        gene_pairs = sorted((g.lower(),g) for g in unique_genes)

        if gene_pairs :
            gene_list, orig_gene_list = zip(*gene_pairs)
        else :
            # zip(*[]) yields nothing to unpack, so an empty submission
            # needs explicit empty tuples
            gene_list, orig_gene_list = (), ()

        # map lower cased -> original first, then original -> lower cased, so
        # the second direction wins whenever a key appears in both
        gene_name_map = dict(gene_pairs)
        gene_name_map.update((orig,lower) for lower,orig in gene_pairs)
        self.gene_name_map = gene_name_map

        # the session id is derived from the concatenated lower cased names
        hid = str(hash(''.join(gene_list)))
        session_d = self.sessions.get(hid)
        session_d['hid'] = hid
        session_d['gene_list'] = gene_list
        session_d['orig_gene_list'] = orig_gene_list
        session_d['gene_name_map'] = self.gene_name_map

        self.gene_list = gene_list
        self.orig_gene_list = orig_gene_list

        self.sessions.save(hid,session_d)

        # now that we've set everything, redirect to the processing page
        raise cherrypy.HTTPRedirect("http://fraenkel.mit.edu/adipo_sight/processing?hid=%s"%hid,status=303)
Example #2
0
    def run(self) :
        """Compute motif enrichment for the gene list stored in the session.

        Reads 'gene_list' from self.session_d, fetches the motif score
        records for those genes from the database, and for each condition
        found compares the scores against a sampled background using
        mww_multiprocess().  Results are written back into the session:

        session_d['found']        -- set of region set names that had scores
        session_d['missing']      -- requested genes not in 'found'
        session_d['motif_enrich'] -- dict with keys 'motifs', 'hs_regions'
                                     and 'enriched_motifs'

        Returns an error message string when self.session_d is None.
        """

        session_d = self.session_d

        if session_d is None :
            log.error('hid passed but no session found, aborting')
            return 'Error occurred, please try submitting again'

        gene_list = session_d['gene_list']

        # get motif scores
        cherrypy.log('getting motif scores')

        # sqlite has a limitation of 999 SQL variables, need to split up
        # this query
        batch_size = 500
        gene_list_batches = [gene_list[i:i+batch_size]
                             for i in xrange(0,len(gene_list),batch_size)]
        cherrypy.log('gene_list_batches len()s: %s'%str([len(b) for b in gene_list_batches]))
        region_scores = []
        for batch in gene_list_batches :
            score_q = (self.db_session.query(db.Region,db.SeqData)
                               .join(db.RegionMembership)
                               .join(db.RegionSet)
                               .join(db.SeqData)
                               .filter(db.SeqData.seq_type.has(db.SeqType.name=='motif scores'))
                               .filter(db.RegionSet.name.in_(batch))
                               .filter(db.RegionMembership.dist_to_feature.between(
                                        -int(self.args['upstream']),
                                        int(self.args['downstream'])
                                      ))
                     )
            region_scores.extend(score_q.all())

        # group the unpickled score vectors by condition and remember which
        # region sets (genes) actually had data
        gene_names = set()
        condition_scores = defaultdict(list)
        cherrypy.log('loading motif scores')
        for region, seqdata in region_scores :
            for region_membership in region.membership :
                gene_names.add(region_membership.region_set.name)
            condition = seqdata.condition.name
            # NOTE(review): unpickling runs arbitrary code if the stored
            # value is ever attacker controlled -- confirm seqdata.value is
            # only written by trusted loaders
            motif_scores = cPickle.loads(seqdata.value)
            condition_scores[condition].append(motif_scores)
        cherrypy.log('found conditions: %s'%condition_scores.keys())

        session_d['found'] = gene_names
        session_d['missing'] = [g for g in gene_list if g not in gene_names]

        # compare motif scores of requested genes to all hypersensitive regions
        # in the dataset

        cherrypy.log('loading motif background')
        # pick n random background sequences, but seed so the same indices are picked for every
        # different # of input DHS regions
        # (this reseeds the module level random generator)
        random.seed("jo mama")

        # motif names and cluster assignments do not depend on the condition,
        # so read them once and close the files when done
        motif_name_fn = resource_filename('adipo_sight','data/motif_names.txt')
        with open(motif_name_fn) as motif_name_f :
            motif_names = np.array(motif_name_f.readlines())

        motif_cluster_fn = resource_filename('adipo_sight','data/motif_clusters.txt')
        with open(motif_cluster_fn) as motif_cluster_f :
            # line index -> cluster id
            motif_cluster_map = dict((i,int(m)) for i,m in enumerate(motif_cluster_f))

        # walk through the conditions and compute scores
        sig_scores = defaultdict(dict)
        hs_regions = {}
        enriched_motifs = {}
        for c, cond_scores in condition_scores.items() :

            score_mat = np.array(cond_scores)

            hs_regions[c] = len(cond_scores)

            # get the background out for this condition
            motif_bg_fn = resource_filename('adipo_sight','data/%s_hypersensitive_peaks_bg_motif_scores.npy'%c)

            all_scores = np.load(motif_bg_fn).T
            # sample as many background rows as there are foreground rows,
            # capped at the size of the background
            bg_inds = random.sample(xrange(all_scores.shape[0]),min(score_mat.shape[0],all_scores.shape[0]))
            all_scores = all_scores[bg_inds,:]

            cherrypy.log('score_mat.shape = %s'%str(score_mat.shape))
            cherrypy.log('all_scores.shape = %s'%str(all_scores.shape))
            cherrypy.log('done loading motif background')

            # calculate MWW
            pvals = mww_multiprocess(score_mat.T,all_scores.T,True)
            log.debug('motif scores for condition: %s'%c)
            log.debug('pvals: %s'%str(pvals.shape))

            # NOTE(review): assumes parse_kwargs stored 'diff_hyp_pval' as a
            # number; confirm it is not left as a string or None
            thresh = pvals < self.args.get('diff_hyp_pval')
            thresh_inds = np.where(thresh)[0]
            thresh_names, thresh_pvals = motif_names[thresh], pvals[thresh]
            cluster_set = set()
            for i,n,p in zip(thresh_inds,thresh_names,thresh_pvals) :
                cluster_i = motif_cluster_map[i]
                sig_scores[cluster_i].setdefault('name',set()).add(n.strip())
                # keep the smallest p-value seen for this cluster/condition
                sig_scores[cluster_i][c] = min(sig_scores[cluster_i].get(c,1.),p)
                cluster_set.add(cluster_i)

            enriched_motifs[c] = len(cluster_set)

        d = {}
        d['motifs'] = dict(sig_scores)
        d['hs_regions'] = hs_regions
        d['enriched_motifs'] = enriched_motifs

        session_d['motif_enrich'] = d