def get_symbol_values_pairs(self): id_list = [self.data[symbol].data_id for symbol in self.data] print("id_list:", id_list) symbol_values_dict = {} query = """SELECT TissueProbeSetXRef.Symbol, TissueProbeSetData.value FROM TissueProbeSetXRef, TissueProbeSetData WHERE TissueProbeSetData.Id IN {} and TissueProbeSetXRef.DataId = TissueProbeSetData.Id""".format(db_tools.create_in_clause(id_list)) print("TISSUE QUERY:", query) results = g.db.execute(query).fetchall() for result in results: if result.Symbol.lower() not in symbol_values_dict: symbol_values_dict[result.Symbol.lower()] = [result.value] else: symbol_values_dict[result.Symbol.lower()].append(result.value) #for symbol in self.data: # data_id = self.data[symbol].data_id # symbol_values_dict[symbol] = self.get_tissue_values(data_id) return symbol_values_dict
def get_symbol_values_pairs(self): id_list = [self.data[symbol].data_id for symbol in self.data] symbol_values_dict = {} if len(id_list) > 0: query = """SELECT TissueProbeSetXRef.Symbol, TissueProbeSetData.value FROM TissueProbeSetXRef, TissueProbeSetData WHERE TissueProbeSetData.Id IN {} and TissueProbeSetXRef.DataId = TissueProbeSetData.Id""".format( db_tools.create_in_clause(id_list)) results = g.db.execute(query).fetchall() for result in results: if result.Symbol.lower() not in symbol_values_dict: symbol_values_dict[result.Symbol.lower()] = [result.value] else: symbol_values_dict[result.Symbol.lower()].append( result.value) #for symbol in self.data: # data_id = self.data[symbol].data_id # symbol_values_dict[symbol] = self.get_tissue_values(data_id) return symbol_values_dict
def __init__(self, gene_symbols=None): self.gene_symbols = gene_symbols self.have_data = False if self.gene_symbols == None: self.gene_symbols = [] print("self.gene_symbols:", self.gene_symbols) self.data = collections.defaultdict(Bunch) #self.gene_id_dict ={} #self.data_id_dict = {} #self.chr_dict = {} #self.mb_dict = {} #self.desc_dict = {} #self.probe_target_desc_dict = {} query = '''select t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, t.description, t.Probe_Target_Description from ( select Symbol, max(Mean) as maxmean from TissueProbeSetXRef where TissueProbeSetFreezeId=1 and ''' # Note that inner join is necessary in this query to get distinct record in one symbol group # with highest mean value # Due to the limit size of TissueProbeSetFreezeId table in DB, # performance of inner join is acceptable.MrnaAssayTissueData(gene_symbols=symbol_list) if len(gene_symbols) == 0: query += '''Symbol!='' and Symbol Is Not Null group by Symbol) as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; ''' else: in_clause = db_tools.create_in_clause(gene_symbols) #ZS: This was in the query, not sure why: http://docs.python.org/2/library/string.html?highlight=lower#string.lower query += ''' Symbol in {} group by Symbol) as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; '''.format(in_clause) results = g.db.execute(query).fetchall() for result in results: symbol = result[0] if symbol.lower() in [gene_symbol.lower() for gene_symbol in gene_symbols]: #gene_symbols.append(symbol) symbol = symbol.lower() self.data[symbol].gene_id = result.GeneId self.data[symbol].data_id = result.DataId self.data[symbol].chr = result.Chr self.data[symbol].mb = result.Mb self.data[symbol].description = result.description self.data[symbol].probe_target_description = result.Probe_Target_Description print("self.data: ", pf(self.data))
def __init__(self, gene_symbols=None): self.gene_symbols = gene_symbols if self.gene_symbols == None: self.gene_symbols = [] self.data = collections.defaultdict(Bunch) query = '''select t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, t.description, t.Probe_Target_Description from ( select Symbol, max(Mean) as maxmean from TissueProbeSetXRef where TissueProbeSetFreezeId=1 and ''' # Note that inner join is necessary in this query to get distinct record in one symbol group # with highest mean value # Due to the limit size of TissueProbeSetFreezeId table in DB, # performance of inner join is acceptable.MrnaAssayTissueData(gene_symbols=symbol_list) if len(gene_symbols) == 0: query += '''Symbol!='' and Symbol Is Not Null group by Symbol) as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; ''' else: in_clause = db_tools.create_in_clause(gene_symbols) # ZS: This was in the query, not sure why: http://docs.python.org/2/library/string.html?highlight=lower#string.lower query += ''' Symbol in {} group by Symbol) as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; '''.format(in_clause) # lower_symbols = [] lower_symbols = {} for gene_symbol in gene_symbols: # lower_symbols[gene_symbol.lower()] = True if gene_symbol != None: lower_symbols[gene_symbol.lower()] = True results = list(g.db.execute(query).fetchall()) for result in results: symbol = result[0] if symbol is not None and lower_symbols.get(symbol.lower()): symbol = symbol.lower() self.data[symbol].gene_id = result.GeneId self.data[symbol].data_id = result.DataId self.data[symbol].chr = result.Chr self.data[symbol].mb = result.Mb self.data[symbol].description = result.description self.data[ symbol].probe_target_description = result.Probe_Target_Description
def get_trait_data(self, sample_list=None): if sample_list: self.samplelist = sample_list else: self.samplelist = self.group.samplelist if self.group.parlist != None and self.group.f1list != None: if (self.group.parlist + self.group.f1list) in self.samplelist: self.samplelist += self.group.parlist + self.group.f1list query = """ SELECT Strain.Name, Strain.Id FROM Strain, Species WHERE Strain.Name IN {} and Strain.SpeciesId=Species.Id and Species.name = '{}' """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) logger.sql(query) results = dict(g.db.execute(query).fetchall()) sample_ids = [results[item] for item in self.samplelist] # MySQL limits the number of tables that can be used in a join to 61, # so we break the sample ids into smaller chunks # Postgres doesn't have that limit, so we can get rid of this after we transition chunk_size = 50 number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) trait_sample_data = [] for sample_ids_step in chunks.divide_into_chunks( sample_ids, number_chunks): if self.type == "Publish": dataset_type = "Phenotype" else: dataset_type = self.type temp = ['T%s.value' % item for item in sample_ids_step] if self.type == "Publish": query = "SELECT {}XRef.Id,".format(escape(self.type)) else: query = "SELECT {}.Name,".format(escape(dataset_type)) data_start_pos = 1 query += ', '.join(temp) query += ' FROM ({}, {}XRef, {}Freeze) '.format( *mescape(dataset_type, self.type, self.type)) for item in sample_ids_step: query += """ left join {}Data as T{} on T{}.Id = {}XRef.DataId and T{}.StrainId={}\n """.format( *mescape(self.type, item, item, self.type, item, item)) if self.type == "Publish": query += """ WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId and {}Freeze.Name = '{}' and {}.Id = {}XRef.{}Id order by {}.Id """.format(*mescape(self.type, self.type, self.type, self.name, dataset_type, self.type, dataset_type, dataset_type)) else: query += """ WHERE {}XRef.{}FreezeId = {}Freeze.Id and {}Freeze.Name = '{}' and {}.Id = {}XRef.{}Id order by {}.Id """.format(*mescape( self.type, self.type, self.type, self.type, self.name, dataset_type, self.type, self.type, dataset_type)) #logger.debug("trait data query: ", query) logger.sql(query) results = g.db.execute(query).fetchall() #logger.debug("query results:", results) trait_sample_data.append(results) trait_count = len(trait_sample_data[0]) self.trait_data = collections.defaultdict(list) # put all of the separate data together into a dictionary where the keys are # trait names and values are lists of sample values for trait_counter in range(trait_count): trait_name = trait_sample_data[0][trait_counter][0] for chunk_counter in range(int(number_chunks)): self.trait_data[trait_name] += ( trait_sample_data[chunk_counter][trait_counter] [data_start_pos:])