def get_symbol_values_pairs(self):
        id_list = [self.data[symbol].data_id for symbol in self.data]

        print("id_list:", id_list)

        symbol_values_dict = {}
        
        query = """SELECT TissueProbeSetXRef.Symbol, TissueProbeSetData.value
                   FROM TissueProbeSetXRef, TissueProbeSetData
                   WHERE TissueProbeSetData.Id IN {} and
                         TissueProbeSetXRef.DataId = TissueProbeSetData.Id""".format(db_tools.create_in_clause(id_list))
        
        print("TISSUE QUERY:", query)
        
        results = g.db.execute(query).fetchall()
        for result in results:
            if result.Symbol.lower() not in symbol_values_dict:
                symbol_values_dict[result.Symbol.lower()] = [result.value]
            else:
                symbol_values_dict[result.Symbol.lower()].append(result.value)

        #for symbol in self.data:
        #    data_id = self.data[symbol].data_id
        #    symbol_values_dict[symbol] = self.get_tissue_values(data_id)
        
    
        return symbol_values_dict
Example #2
0
    def get_symbol_values_pairs(self):
        id_list = [self.data[symbol].data_id for symbol in self.data]

        symbol_values_dict = {}

        if len(id_list) > 0:
            query = """SELECT TissueProbeSetXRef.Symbol, TissueProbeSetData.value
                       FROM TissueProbeSetXRef, TissueProbeSetData
                       WHERE TissueProbeSetData.Id IN {} and
                             TissueProbeSetXRef.DataId = TissueProbeSetData.Id""".format(
                db_tools.create_in_clause(id_list))

            results = g.db.execute(query).fetchall()
            for result in results:
                if result.Symbol.lower() not in symbol_values_dict:
                    symbol_values_dict[result.Symbol.lower()] = [result.value]
                else:
                    symbol_values_dict[result.Symbol.lower()].append(
                        result.value)

        #for symbol in self.data:
        #    data_id = self.data[symbol].data_id
        #    symbol_values_dict[symbol] = self.get_tissue_values(data_id)

        return symbol_values_dict
    def __init__(self, gene_symbols=None):
        self.gene_symbols = gene_symbols
        self.have_data = False
        if self.gene_symbols == None:
            self.gene_symbols = []
        
        print("self.gene_symbols:", self.gene_symbols)
        
        self.data = collections.defaultdict(Bunch)
            
        #self.gene_id_dict ={}
        #self.data_id_dict = {}
        #self.chr_dict = {}
        #self.mb_dict = {}
        #self.desc_dict = {}
        #self.probe_target_desc_dict = {}
        
        query =  '''select t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, t.description, t.Probe_Target_Description
                        from (
                        select Symbol, max(Mean) as maxmean
                        from TissueProbeSetXRef
                        where TissueProbeSetFreezeId=1 and '''
        
        # Note that inner join is necessary in this query to get distinct record in one symbol group
        # with highest mean value
        # Due to the limit size of TissueProbeSetFreezeId table in DB,
        # performance of inner join is acceptable.MrnaAssayTissueData(gene_symbols=symbol_list)
        if len(gene_symbols) == 0:
            query +=  '''Symbol!='' and Symbol Is Not Null group by Symbol)
                as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol
                and t.Mean = x.maxmean;  
                    '''
        else:
            in_clause = db_tools.create_in_clause(gene_symbols)
            
            #ZS: This was in the query, not sure why: http://docs.python.org/2/library/string.html?highlight=lower#string.lower

            query += ''' Symbol in {} group by Symbol)
                as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol
                and t.Mean = x.maxmean;
                    '''.format(in_clause)

        results = g.db.execute(query).fetchall()
        
        for result in results:
            symbol = result[0]
            if symbol.lower() in [gene_symbol.lower() for gene_symbol in gene_symbols]:
                #gene_symbols.append(symbol)
                symbol = symbol.lower()
                
                self.data[symbol].gene_id = result.GeneId
                self.data[symbol].data_id = result.DataId
                self.data[symbol].chr = result.Chr
                self.data[symbol].mb = result.Mb
                self.data[symbol].description = result.description
                self.data[symbol].probe_target_description = result.Probe_Target_Description

        print("self.data: ", pf(self.data))
Example #4
0
    def __init__(self, gene_symbols=None):
        self.gene_symbols = gene_symbols
        if self.gene_symbols == None:
            self.gene_symbols = []

        self.data = collections.defaultdict(Bunch)

        query = '''select t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, t.description, t.Probe_Target_Description
                        from (
                        select Symbol, max(Mean) as maxmean
                        from TissueProbeSetXRef
                        where TissueProbeSetFreezeId=1 and '''

        # Note that inner join is necessary in this query to get distinct record in one symbol group
        # with highest mean value
        # Due to the limit size of TissueProbeSetFreezeId table in DB,
        # performance of inner join is acceptable.MrnaAssayTissueData(gene_symbols=symbol_list)
        if len(gene_symbols) == 0:
            query += '''Symbol!='' and Symbol Is Not Null group by Symbol)
                as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol
                and t.Mean = x.maxmean;
                    '''
        else:
            in_clause = db_tools.create_in_clause(gene_symbols)

            # ZS: This was in the query, not sure why: http://docs.python.org/2/library/string.html?highlight=lower#string.lower
            query += ''' Symbol in {} group by Symbol)
                as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol
                and t.Mean = x.maxmean;
                    '''.format(in_clause)

        # lower_symbols = []
        lower_symbols = {}
        for gene_symbol in gene_symbols:
            # lower_symbols[gene_symbol.lower()] = True
            if gene_symbol != None:
                lower_symbols[gene_symbol.lower()] = True
        results = list(g.db.execute(query).fetchall())
        for result in results:
            symbol = result[0]
            if symbol is not None and lower_symbols.get(symbol.lower()):

                symbol = symbol.lower()

                self.data[symbol].gene_id = result.GeneId
                self.data[symbol].data_id = result.DataId
                self.data[symbol].chr = result.Chr
                self.data[symbol].mb = result.Mb
                self.data[symbol].description = result.description
                self.data[
                    symbol].probe_target_description = result.Probe_Target_Description
Example #5
0
    def get_trait_data(self, sample_list=None):
        if sample_list:
            self.samplelist = sample_list
        else:
            self.samplelist = self.group.samplelist

        if self.group.parlist != None and self.group.f1list != None:
            if (self.group.parlist + self.group.f1list) in self.samplelist:
                self.samplelist += self.group.parlist + self.group.f1list

        query = """
            SELECT Strain.Name, Strain.Id FROM Strain, Species
            WHERE Strain.Name IN {}
            and Strain.SpeciesId=Species.Id
            and Species.name = '{}'
            """.format(create_in_clause(self.samplelist),
                       *mescape(self.group.species))
        logger.sql(query)
        results = dict(g.db.execute(query).fetchall())
        sample_ids = [results[item] for item in self.samplelist]

        # MySQL limits the number of tables that can be used in a join to 61,
        # so we break the sample ids into smaller chunks
        # Postgres doesn't have that limit, so we can get rid of this after we transition
        chunk_size = 50
        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
        trait_sample_data = []
        for sample_ids_step in chunks.divide_into_chunks(
                sample_ids, number_chunks):
            if self.type == "Publish":
                dataset_type = "Phenotype"
            else:
                dataset_type = self.type
            temp = ['T%s.value' % item for item in sample_ids_step]
            if self.type == "Publish":
                query = "SELECT {}XRef.Id,".format(escape(self.type))
            else:
                query = "SELECT {}.Name,".format(escape(dataset_type))
            data_start_pos = 1
            query += ', '.join(temp)
            query += ' FROM ({}, {}XRef, {}Freeze) '.format(
                *mescape(dataset_type, self.type, self.type))

            for item in sample_ids_step:
                query += """
                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
                        and T{}.StrainId={}\n
                        """.format(
                    *mescape(self.type, item, item, self.type, item, item))

            if self.type == "Publish":
                query += """
                        WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
                        and {}Freeze.Name = '{}'
                        and {}.Id = {}XRef.{}Id
                        order by {}.Id
                        """.format(*mescape(self.type, self.type, self.type,
                                            self.name, dataset_type, self.type,
                                            dataset_type, dataset_type))
            else:
                query += """
                        WHERE {}XRef.{}FreezeId = {}Freeze.Id
                        and {}Freeze.Name = '{}'
                        and {}.Id = {}XRef.{}Id
                        order by {}.Id
                        """.format(*mescape(
                    self.type, self.type, self.type, self.type, self.name,
                    dataset_type, self.type, self.type, dataset_type))

            #logger.debug("trait data query: ", query)

            logger.sql(query)
            results = g.db.execute(query).fetchall()
            #logger.debug("query results:", results)
            trait_sample_data.append(results)

        trait_count = len(trait_sample_data[0])
        self.trait_data = collections.defaultdict(list)

        # put all of the separate data together into a dictionary where the keys are
        # trait names and values are lists of sample values
        for trait_counter in range(trait_count):
            trait_name = trait_sample_data[0][trait_counter][0]
            for chunk_counter in range(int(number_chunks)):
                self.trait_data[trait_name] += (
                    trait_sample_data[chunk_counter][trait_counter]
                    [data_start_pos:])