def gen_search_result(self):
    """Fill ``self.trait_list`` from the rows computed by the "search" function.

    For every non-empty row in ``self.results`` a ``GeneralTrait`` is created
    for ``self.dataset`` (the trait name is the row's first column), its info
    is loaded with ``retrieve_info(get_qtl_info=True)`` and the trait is
    appended to ``self.trait_list``.  Once all rows are processed the dataset
    is asked to add the table info via ``get_trait_info``.
    """
    self.trait_list = []

    species = webqtlDatabaseFunction.retrieve_species(self.dataset.group.name)

    # result_set represents the results for each search term; a search of
    # "shh grin2b" would have two sets of results, one for each term
    print("self.results is:", pf(self.results))
    for result in self.results:
        if not result:
            # Empty rows carry no trait id, skip them.
            continue

        #### Excel file needs to be generated ####

        # No per-row dump of locals() here: it flooded stdout with the whole
        # frame (including self) once per search result.
        trait_id = result[0]
        this_trait = GeneralTrait(dataset=self.dataset, name=trait_id)
        this_trait.retrieve_info(get_qtl_info=True)
        self.trait_list.append(this_trait)

    self.dataset.get_trait_info(self.trait_list, species)
def gen_covariates_file(this_dataset, covariates):
    """Write the covariate table for ``this_dataset`` to a tab-separated file.

    ``covariates`` is a comma-separated string of ``trait_name:dataset_name``
    entries.  Each entry becomes one column; each sample in
    ``this_dataset.group.samplelist`` becomes one row.  A sample that has no
    value for a covariate trait is written as "-9".  The file is written to
    ``<flat_files('mapping')>/<group name>_covariates.txt`` and every cell,
    including the last one of a row, is followed by a tab.
    """
    columns = []
    for covariate_spec in covariates.split(","):
        spec_parts = covariate_spec.split(":")
        trait_name = spec_parts[0]
        covariate_dataset = create_dataset(spec_parts[1])
        covariate_trait = GeneralTrait(dataset=covariate_dataset,
                                       name=trait_name,
                                       cellid=None)

        #trait_samples = this_dataset.group.all_samples_ordered()
        this_dataset.group.get_samplelist()
        sample_names = this_dataset.group.samplelist
        logger.debug("SAMPLES:", sample_names)
        values_by_sample = covariate_trait.data
        logger.debug("SAMPLE DATA:", values_by_sample)

        # One entry per group sample, in sample-list order.
        columns.append([
            values_by_sample[name].value if name in values_by_sample else "-9"
            for name in sample_names
        ])

    with open(
            "{}/{}_covariates.txt".format(flat_files('mapping'),
                                          this_dataset.group.name),
            "w") as outfile:
        # The columns are transposed on the fly: row i holds the i-th value
        # of every covariate.
        for row_index in range(len(columns[0])):
            row_text = "".join(
                str(column[row_index]) + "\t" for column in columns)
            outfile.write(row_text)
            outfile.write("\n")
def gen_search_result(self):
    """Build the trait objects shown in the search result table.

    Reads the rows produced by the "search" function from ``self.results``,
    creates one ``GeneralTrait`` (with QTL info, without sample info) per
    non-empty row and stores them in ``self.trait_list``; the dataset then
    fills in the table info through ``get_trait_info``.
    """
    self.trait_list = []

    group_name = self.dataset.group.name
    species = webqtlDatabaseFunction.retrieve_species(group_name)

    # result_set represents the results for each search term; a search of
    # "shh grin2b" would have two sets of results, one for each term
    print("self.results is:", pf(self.results))

    #### Excel file needs to be generated ####

    # The trait name is the first column of each non-empty row.
    non_empty_rows = (row for row in self.results if row)
    for row in non_empty_rows:
        self.trait_list.append(
            GeneralTrait(dataset=self.dataset,
                         name=row[0],
                         get_qtl_info=True,
                         get_sample_info=False))

    self.dataset.get_trait_info(self.trait_list, species)
def get_trait_db_obs(self, trait_db_list):
    """Turn ``trait_name:dataset_name`` strings into (trait, dataset) pairs.

    The result is stored in ``self.trait_list``.  The last entry of
    ``trait_db_list`` is never processed, so a list with zero or one entries
    yields an empty ``self.trait_list``.
    """
    self.trait_list = []

    # Everything except the final entry is converted.
    for entry in trait_db_list[:-1]:
        trait_name, dataset_name = entry.split(":")
        dataset_ob = data_set.create_dataset(dataset_name)
        trait_ob = GeneralTrait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)
        self.trait_list.append((trait_ob, dataset_ob))
def do_correlation(start_vars):
    """Compute correlations for one trait and return them as a list of dicts.

    ``start_vars`` must contain 'db', 'target_db' and 'trait_id' (checked
    with assertions).  The results returned by ``calculate_results`` are cut
    to the first ``corr_params['return_count']`` traits, and each one is
    converted to a dict whose keys depend on ``corr_params['type']``:

    * "tissue": trait, sample_r, #_strains, p_value, symbol
    * "literature" / "lit": trait, sample_r, gene_id
    * anything else: trait, sample_r, #_strains, p_value
    """
    for required_key in ('db', 'target_db', 'trait_id'):
        assert (required_key in start_vars)

    source_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])
    base_trait = GeneralTrait(dataset=source_dataset,
                              name=start_vars['trait_id'])
    base_trait = retrieve_sample_data(base_trait, source_dataset)

    corr_params = init_corr_params(start_vars)

    corr_results = calculate_results(base_trait, source_dataset,
                                     target_dataset, corr_params)

    #corr_results = collections.OrderedDict(sorted(corr_results.items(), key=lambda t: -abs(t[1][0])))

    final_results = []
    for trait in corr_results.keys()[:corr_params['return_count']]:
        corr_type = corr_params['type']
        row = corr_results[trait]

        # The layout of each result row differs per correlation type.
        if corr_type == "tissue":
            sample_r, num_overlap, sample_p, symbol = row
            entry = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol
            }
        elif corr_type in ("literature", "lit"):
            gene_id, sample_r = row
            entry = {
                "trait": trait,
                "sample_r": sample_r,
                "gene_id": gene_id
            }
        else:
            sample_r, sample_p, num_overlap = row
            entry = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p
            }

        final_results.append(entry)

    # json_corr_results = generate_corr_json(final_corr_results, this_trait, this_dataset, target_dataset, for_api = True)

    return final_results
def add_cofactors(cross, this_dataset, covariates, samples):
    """Add each covariate trait to ``cross`` as a phenotype column.

    ``covariates`` is a comma-separated string of ``trait_name:dataset_name``
    entries.  For the i-th entry the values of the group samples that are
    also in ``samples`` are collected ("NA" where the trait has no value for
    a sample), rendered as an R vector literal ``c(v1,v2,...)`` and added to
    ``cross`` under the column name ``covar_<i>``.

    Returns a tuple ``(cross, covars_ob)`` where ``covars_ob`` is the result
    of ``pull_var("trait_covars", cross, <R vector of the column names>)``.
    """
    ro.numpy2ri.activate()

    quoted_column_names = []
    for position, covariate_spec in enumerate(covariates.split(",")):
        spec_parts = covariate_spec.split(":")
        trait_name = spec_parts[0]
        covariate_dataset = create_dataset(spec_parts[1])
        covariate_trait = GeneralTrait(dataset=covariate_dataset,
                                       name=trait_name,
                                       cellid=None)

        this_dataset.group.get_samplelist()
        group_samples = this_dataset.group.samplelist
        values_by_sample = covariate_trait.data

        # Only samples used in the mapping contribute a value.
        covar_values = [
            values_by_sample[name].value if name in values_by_sample else "NA"
            for name in group_samples
            if name in samples
        ]

        r_vector = "c(" + ",".join(str(value) for value in covar_values) + ")"

        column_name = "covar_" + str(position)
        cross = add_phenotype(cross, r_vector, column_name)
        quoted_column_names.append('"' + column_name + '"')

    r_name_vector = "c(" + ", ".join(quoted_column_names) + ")"

    covars_ob = pull_var("trait_covars", cross, r_name_vector)
    return cross, covars_ob
def __init__(self, kw):
    """Collect the data needed to render a trait page.

    ``kw`` is the request dictionary.  Three cases are handled:

    * 'trait_id' present and ``kw['dataset']`` is not "Temp": a stored trait,
      resolved by ``helper_functions.get_species_dataset_trait``.
    * 'group' present: a newly pasted temporary trait; a "Temp_..." id is
      generated and the pasted values are stored in Redis under that id.
    * otherwise: an existing temporary trait whose values are read back
      from Redis; species and group are parsed out of the trait id.

    Afterwards the BLAT URLs, nearest marker, sample lists, the hidden form
    values (``self.hddn``) and the JavaScript payload (``self.js_data``) are
    prepared.
    """
    logger.debug("in ShowTrait, kw are:", kw)

    if 'trait_id' in kw and kw['dataset'] != "Temp":
        self.temp_trait = False
        self.trait_id = kw['trait_id']
        helper_functions.get_species_dataset_trait(self, kw)
    elif 'group' in kw:
        self.temp_trait = True
        self.trait_id = "Temp_" + kw['species'] + "_" + kw[
            'group'] + "_" + datetime.datetime.now().strftime("%m%d%H%M%S")
        self.temp_species = kw['species']
        self.temp_group = kw['group']
        self.dataset = data_set.create_dataset(dataset_name="Temp",
                                               dataset_type="Temp",
                                               group_name=self.temp_group)
        # Put values in Redis so they can be looked up later if added to a collection
        Redis.set(self.trait_id, kw['trait_paste'])
        self.trait_vals = kw['trait_paste'].split()
        self.this_trait = GeneralTrait(dataset=self.dataset,
                                       name=self.trait_id,
                                       cellid=None)
    else:
        self.temp_trait = True
        self.trait_id = kw['trait_id']
        # Temp trait ids have the form Temp_<species>_<group>_<timestamp>.
        self.temp_species = self.trait_id.split("_")[1]
        self.temp_group = self.trait_id.split("_")[2]
        self.dataset = data_set.create_dataset(dataset_name="Temp",
                                               dataset_type="Temp",
                                               group_name=self.temp_group)
        self.this_trait = GeneralTrait(dataset=self.dataset,
                                       name=self.trait_id,
                                       cellid=None)
        self.trait_vals = Redis.get(self.trait_id).split()

    #ZS: Get verify/rna-seq link URLs
    try:
        blatsequence = self.this_trait.blatseq

        #XZ, 06/03/2009: ProbeSet name is not unique among platforms. We should use ProbeSet Id instead.
        # NOTE(review): the dataset and trait names are interpolated into the
        # SQL text; confirm they cannot carry untrusted characters.
        probe_query = (
            "SELECT Probe.Sequence, Probe.Name "
            "FROM Probe, ProbeSet, ProbeSetFreeze, ProbeSetXRef "
            "WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND "
            "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND "
            "ProbeSetFreeze.Name = '%s' AND "
            "ProbeSet.Name = '%s' AND "
            "Probe.ProbeSetId = ProbeSet.Id "
            "order by Probe.SerialOrder") % (self.this_trait.dataset.name,
                                             self.this_trait.name)

        if not blatsequence:
            seqs = g.db.execute(probe_query).fetchall()
            if not seqs:
                # Nothing to BLAT; the handler below blanks both URLs.
                raise ValueError
            else:
                blatsequence = ''
                for seqt in seqs:
                    # Only probes whose name ends in an odd digit are used.
                    if int(seqt[1][-1]) % 2 == 1:
                        blatsequence += seqt[0].strip()

        #--------Hongqiang add this part in order to not only blat ProbeSet, but also blat Probe
        # '%3E' and '%0A' are the URL-encoded '>' and newline.
        blatsequence = '%3E' + self.this_trait.name + '%0A' + blatsequence + '%0A'
        seqs = g.db.execute(probe_query).fetchall()
        for seqt in seqs:
            if int(seqt[1][-1]) % 2 == 1:
                blatsequence += '%3EProbe_' + seqt[1].strip(
                ) + '%0A' + seqt[0].strip() + '%0A'

        if self.dataset.group.species == "rat":
            self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('rat', 'rn6',
                                                           blatsequence)
            self.UTHSC_BLAT_URL = ""
        elif self.dataset.group.species == "mouse":
            self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('mouse', 'mm10',
                                                           blatsequence)
            self.UTHSC_BLAT_URL = webqtlConfig.UTHSC_BLAT % (
                'mouse', 'mm10', blatsequence)
        elif self.dataset.group.species == "human":
            self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('human', 'hg38',
                                                           blatsequence)
            self.UTHSC_BLAT_URL = ""
        else:
            self.UCSC_BLAT_URL = ""
            self.UTHSC_BLAT_URL = ""
    except Exception:
        # Best effort: any failure while building the sequence (missing
        # attribute, no probes, DB error) just disables the BLAT links.
        self.UCSC_BLAT_URL = ""
        self.UTHSC_BLAT_URL = ""

    if self.dataset.type == "ProbeSet":
        self.show_probes = "True"

    trait_units = get_trait_units(self.this_trait)
    self.get_external_links()
    self.build_correlation_tools()

    self.ncbi_summary = get_ncbi_summary(self.this_trait)

    #Get nearest marker for composite mapping
    if not self.temp_trait:
        if check_if_attr_exists(
                self.this_trait, 'locus_chr'
        ) and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            self.nearest_marker = get_nearest_marker(self.this_trait,
                                                     self.dataset)
        else:
            self.nearest_marker = ""

    self.make_sample_lists()

    self.qnorm_vals = quantile_normalize_vals(self.sample_groups)
    self.z_scores = get_z_scores(self.sample_groups)

    self.temp_uuid = uuid.uuid4()

    self.sample_group_types = OrderedDict()
    if len(self.sample_groups) > 1:
        self.sample_group_types['samples_primary'] = self.dataset.group.name
        self.sample_group_types['samples_other'] = "Other"
        self.sample_group_types['samples_all'] = "All"
    else:
        self.sample_group_types['samples_primary'] = self.dataset.group.name
    sample_lists = [group.sample_list for group in self.sample_groups]

    categorical_var_list = []
    if not self.temp_trait:
        #ZS: Only using first samplelist, since I think mapping only uses those samples
        categorical_var_list = get_categorical_variables(
            self.this_trait, self.sample_groups[0])

    #ZS: Get list of chromosomes to select for mapping
    self.chr_list = [["All", -1]]
    for i, this_chr in enumerate(
            self.dataset.species.chromosomes.chromosomes):
        self.chr_list.append([
            self.dataset.species.chromosomes.chromosomes[this_chr].name, i
        ])

    self.genofiles = self.dataset.group.get_genofiles()

    self.has_num_cases = has_num_cases(self.this_trait)

    self.stats_table_width, self.trait_table_width = get_table_widths(
        self.sample_groups, self.has_num_cases)

    #ZS: Needed to know whether to display bar chart + get max sample name length in order to set table column width
    self.num_values = 0
    #ZS: So it knows whether to display the Binary R/qtl mapping method, which doesn't work unless all values are 0 or 1
    self.binary = "true"
    #ZS: Since we don't want to show log2 transform option for situations where it doesn't make sense
    self.negative_vals_exist = "false"
    max_samplename_width = 1
    for group in self.sample_groups:
        for sample in group.sample_list:
            if len(sample.name) > max_samplename_width:
                max_samplename_width = len(sample.name)
            # "x" is the display value of a sample without data.
            if sample.display_value != "x":
                self.num_values += 1
                # A value is non-binary only when it differs from BOTH 0 and
                # 1; combining the two tests with `or` is true for every
                # value and would always switch the binary option off.
                if sample.value != 0 and sample.value != 1:
                    self.binary = "false"
                if sample.value < 0:
                    self.negative_vals_exist = "true"

    # Presumably 8 pixels per character of the longest sample name.
    sample_column_width = max_samplename_width * 8

    if self.num_values >= 5000:
        self.maf = 0.01
    else:
        self.maf = 0.05

    trait_symbol = None
    short_description = None
    if not self.temp_trait:
        if self.this_trait.symbol:
            trait_symbol = self.this_trait.symbol
            short_description = trait_symbol
        elif hasattr(self.this_trait, 'post_publication_abbreviation'):
            short_description = self.this_trait.post_publication_abbreviation
        elif hasattr(self.this_trait, 'pre_publication_abbreviation'):
            short_description = self.this_trait.pre_publication_abbreviation

    # Todo: Add back in the ones we actually need from below, as we discover we need them
    hddn = OrderedDict()

    if self.dataset.group.allsamples:
        hddn['allsamples'] = ' '.join(self.dataset.group.allsamples)
    hddn['primary_samples'] = ','.join(self.primary_sample_names)
    hddn['trait_id'] = self.trait_id
    hddn['trait_display_name'] = self.this_trait.display_name
    hddn['dataset'] = self.dataset.name
    hddn['temp_trait'] = False
    if self.temp_trait:
        hddn['temp_trait'] = True
        hddn['group'] = self.temp_group
        hddn['species'] = self.temp_species
    hddn['use_outliers'] = False
    hddn['method'] = "gemma"
    hddn['selected_chr'] = -1
    hddn['mapping_display_all'] = True
    hddn['suggestive'] = 0
    hddn['num_perm'] = 0
    hddn['categorical_vars'] = ""
    hddn['manhattan_plot'] = ""
    hddn['control_marker'] = ""
    if not self.temp_trait:
        if hasattr(
                self.this_trait, 'locus_chr'
        ) and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            hddn['control_marker'] = self.nearest_marker
    hddn['do_control'] = False
    hddn['maf'] = 0.05
    hddn['compare_traits'] = []
    hddn['export_data'] = ""
    hddn['export_format'] = "excel"

    # We'll need access to this_trait and hddn in the Jinja2 Template, so we put it inside self
    self.hddn = hddn

    js_data = dict(trait_id=self.trait_id,
                   trait_symbol=trait_symbol,
                   short_description=short_description,
                   unit_type=trait_units,
                   dataset_type=self.dataset.type,
                   data_scale=self.dataset.data_scale,
                   sample_group_types=self.sample_group_types,
                   sample_lists=sample_lists,
                   attribute_names=self.sample_groups[0].attributes,
                   categorical_vars=",".join(categorical_var_list),
                   num_values=self.num_values,
                   qnorm_values=self.qnorm_vals,
                   zscore_values=self.z_scores,
                   sample_column_width=sample_column_width,
                   temp_uuid=self.temp_uuid)
    self.js_data = js_data
def __init__(self, kw):
    """Run a global search and build the matching trait objects.

    ``kw['type']`` selects the search ("gene" or "phenotype") and
    ``kw['terms']`` holds the search terms.  For both types the query is
    limited to 6000 rows and ``self.trait_list`` receives one
    ``GeneralTrait`` (with QTL info, without sample info) per row.  For any
    other type only ``self.type`` and ``self.terms`` are set.

    The search terms are passed to the database as bound parameters instead
    of being formatted into the SQL text, so quotes in the terms cannot
    change the statement.
    """
    self.type = kw['type']
    self.terms = kw['terms']

    def build_trait_list(rows, dataset_column, trait_column, dataset_type,
                         **dataset_options):
        # Create one trait object per result row.
        traits = []
        with Bench("Creating trait objects"):
            for row in rows:
                dataset = create_dataset(row[dataset_column], dataset_type,
                                         **dataset_options)
                traits.append(
                    GeneralTrait(dataset=dataset,
                                 name=row[trait_column],
                                 get_qtl_info=True,
                                 get_sample_info=False))
        return traits

    if self.type == "gene":
        sql = """
            SELECT
                Species.`Name` AS species_name,
                InbredSet.`Name` AS inbredset_name,
                Tissue.`Name` AS tissue_name,
                ProbeSetFreeze.Name AS probesetfreeze_name,
                ProbeSet.Name AS probeset_name,
                ProbeSet.Symbol AS probeset_symbol,
                ProbeSet.`description` AS probeset_description,
                ProbeSet.Chr AS chr,
                ProbeSet.Mb AS mb,
                ProbeSetXRef.Mean AS mean,
                ProbeSetXRef.LRS AS lrs,
                ProbeSetXRef.`Locus` AS locus,
                ProbeSetXRef.`pValue` AS pvalue,
                ProbeSetXRef.`additive` AS additive
            FROM Species, InbredSet, ProbeSetXRef, ProbeSet, ProbeFreeze, ProbeSetFreeze, Tissue
            WHERE InbredSet.`SpeciesId`=Species.`Id`
            AND ProbeFreeze.InbredSetId=InbredSet.`Id`
            AND ProbeFreeze.`TissueId`=Tissue.`Id`
            AND ProbeSetFreeze.ProbeFreezeId=ProbeFreeze.Id
            AND ( MATCH (ProbeSet.Name,ProbeSet.description,ProbeSet.symbol,alias,GenbankId, UniGeneId, Probe_Target_Description) AGAINST (%s IN BOOLEAN MODE) )
            AND ProbeSet.Id = ProbeSetXRef.ProbeSetId
            AND ProbeSetXRef.ProbeSetFreezeId=ProbeSetFreeze.Id
            AND ProbeSetFreeze.confidentiality < 1
            AND ProbeSetFreeze.public > 0
            ORDER BY species_name, inbredset_name, tissue_name, probesetfreeze_name, probeset_name
            LIMIT 6000
            """
        with Bench("Running query"):
            logger.sql(sql)
            # The driver quotes the bound value itself.
            rows = g.db.execute(sql, (self.terms,)).fetchall()

        # Column 3 is the dataset name, column 4 the probeset name.
        self.trait_list = build_trait_list(rows, 3, 4, "ProbeSet",
                                           get_samplelist=False)

    elif self.type == "phenotype":
        # Every listed column is matched against the terms as a whole word
        # ([[:<:]] and [[:>:]] are MySQL word-boundary markers).
        regexp_columns = (
            "Phenotype.Post_publication_description",
            "Phenotype.Pre_publication_description",
            "Phenotype.Pre_publication_abbreviation",
            "Phenotype.Post_publication_abbreviation",
            "Phenotype.Lab_code",
            "Publication.PubMed_ID",
            "Publication.Abstract",
            "Publication.Title",
            "Publication.Authors",
            "PublishXRef.Id",
        )
        regexp_clause = " OR ".join(column + " REGEXP %s"
                                    for column in regexp_columns)
        word_pattern = "[[:<:]]%s[[:>:]]" % (self.terms,)

        sql = """
            SELECT
                Species.`Name`,
                InbredSet.`Name`,
                PublishFreeze.`Name`,
                PublishXRef.`Id`,
                Phenotype.`Post_publication_description`,
                Publication.`Authors`,
                Publication.`Year`,
                PublishXRef.`LRS`,
                PublishXRef.`Locus`,
                PublishXRef.`additive`
            FROM Species,InbredSet,PublishFreeze,PublishXRef,Phenotype,Publication
            WHERE PublishXRef.`InbredSetId`=InbredSet.`Id`
            AND PublishFreeze.`InbredSetId`=InbredSet.`Id`
            AND InbredSet.`SpeciesId`=Species.`Id`
            AND PublishXRef.`PhenotypeId`=Phenotype.`Id`
            AND PublishXRef.`PublicationId`=Publication.`Id`
            AND (""" + regexp_clause + """)
            ORDER BY Species.`Name`, InbredSet.`Name`, PublishXRef.`Id`
            LIMIT 6000
            """
        logger.sql(sql)
        # One bound pattern per REGEXP placeholder.
        rows = g.db.execute(
            sql, (word_pattern,) * len(regexp_columns)).fetchall()

        # Column 2 is the dataset name, column 3 the phenotype id.
        self.trait_list = build_trait_list(rows, 2, 3, "Publish")
def __init__(self, params):
    """Prepare the data for a scatter plot of two traits.

    ``params`` must provide 'dataset_1', 'dataset_2', 'trait_1' and
    'trait_2'.  The two traits are reduced to their overlapping samples by
    ``corr_result_helpers.normalize_values_with_samples``; a linear
    regression is then computed on the raw values and on their ranks, and
    everything the plot needs is stored in ``self.js_data`` (also available
    as ``self.jsdata``).
    """
    def format_slope(value):
        # Small magnitudes would show up as 0.000 in fixed notation, so they
        # are printed in scientific notation.  The magnitude is what matters:
        # comparing the signed value would send every negative slope,
        # however large, to scientific notation as well.
        if abs(value) < 0.001:
            return '%.3E' % value
        return '%.3f' % value

    self.data_set_1 = data_set.create_dataset(params['dataset_1'])
    self.data_set_2 = data_set.create_dataset(params['dataset_2'])
    self.trait_1 = GeneralTrait(name=params['trait_1'],
                                dataset=self.data_set_1)
    self.trait_2 = GeneralTrait(name=params['trait_2'],
                                dataset=self.data_set_2)

    samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(
        self.trait_1.data, self.trait_2.data)

    self.indIDs = samples_1.keys()

    vals_1 = [samples_1[sample].value for sample in samples_1.keys()]
    vals_2 = [samples_2[sample].value for sample in samples_2.keys()]
    self.data = [vals_1, vals_2]

    slope, intercept, r_value, p_value, std_err = stats.linregress(
        vals_1, vals_2)
    slope_string = format_slope(slope)

    # Pad each axis by 10% of the value range on both sides.
    x_buffer = (max(vals_1) - min(vals_1)) * 0.1
    y_buffer = (max(vals_2) - min(vals_2)) * 0.1

    x_range = [min(vals_1) - x_buffer, max(vals_1) + x_buffer]
    y_range = [min(vals_2) - y_buffer, max(vals_2) + y_buffer]

    intercept_coords = get_intercept_coords(slope, intercept, x_range,
                                            y_range)

    # Same regression on the ranks of the values.
    rx = stats.rankdata(vals_1)
    ry = stats.rankdata(vals_2)
    self.rdata = [rx.tolist(), ry.tolist()]

    srslope, srintercept, srr_value, srp_value, srstd_err = stats.linregress(
        rx, ry)
    srslope_string = format_slope(srslope)

    # A single range, derived from the first trait's ranks, is used for both
    # axes of the rank plot.
    rank_buffer = (max(rx) - min(rx)) * 0.1
    sr_range = [min(rx) - rank_buffer, max(rx) + rank_buffer]
    sr_intercept_coords = get_intercept_coords(srslope, srintercept,
                                               sr_range, sr_range)

    self.collections_exist = "False"
    if g.user_session.num_collections > 0:
        self.collections_exist = "True"

    self.js_data = dict(
        data=self.data,
        rdata=self.rdata,
        indIDs=self.indIDs,
        trait_1=self.trait_1.dataset.name + ": " + str(self.trait_1.name),
        trait_2=self.trait_2.dataset.name + ": " + str(self.trait_2.name),
        samples_1=samples_1,
        samples_2=samples_2,
        num_overlap=num_overlap,
        vals_1=vals_1,
        vals_2=vals_2,
        x_range=x_range,
        y_range=y_range,
        sr_range=sr_range,
        intercept_coords=intercept_coords,
        sr_intercept_coords=sr_intercept_coords,
        slope=slope,
        slope_string=slope_string,
        intercept=intercept,
        r_value=r_value,
        p_value=p_value,
        srslope=srslope,
        srslope_string=srslope_string,
        srintercept=srintercept,
        srr_value=srr_value,
        srp_value=srp_value)
    self.jsdata = self.js_data
def __init__(self, start_vars):
    """Compute the correlations of one trait against a target dataset.

    ``start_vars`` must contain 'corr_type', 'dataset',
    'corr_sample_method', 'corr_samples_group', 'corr_dataset' and
    'corr_return_results'; when 'loc_chr' is given, 'min_loc_mb' and
    'max_loc_mb' are required as well (all checked with assertions).

    The method gathers the sample values to correlate, computes sample
    r/p values for the target traits (pre-selected by tissue or literature
    correlation when ``corr_type`` is "tissue" or "lit"), sorts them by
    decreasing absolute correlation, applies the optional range filters and
    stores the surviving trait objects in ``self.correlation_results`` and
    their JSON form in ``self.json_results``.
    """
    # get trait list from db (database name)
    # calculate correlation with Base vector and targets

    # Check parameters
    assert ('corr_type' in start_vars)
    assert (is_str(start_vars['corr_type']))
    assert ('dataset' in start_vars)
    # assert('group' in start_vars) permitted to be empty?
    assert ('corr_sample_method' in start_vars)
    assert ('corr_samples_group' in start_vars)
    assert ('corr_dataset' in start_vars)
    #assert('min_expr' in start_vars)
    assert ('corr_return_results' in start_vars)
    if 'loc_chr' in start_vars:
        assert ('min_loc_mb' in start_vars)
        assert ('max_loc_mb' in start_vars)

    with Bench("Doing correlations"):
        if start_vars['dataset'] == "Temp":
            self.dataset = data_set.create_dataset(
                dataset_name="Temp",
                dataset_type="Temp",
                group_name=start_vars['group'])
            self.trait_id = start_vars['trait_id']
            self.this_trait = GeneralTrait(dataset=self.dataset,
                                           name=self.trait_id,
                                           cellid=None)
        else:
            helper_functions.get_species_dataset_trait(self, start_vars)

        corr_samples_group = start_vars['corr_samples_group']

        self.sample_data = {}
        self.corr_type = start_vars['corr_type']
        self.corr_method = start_vars['corr_sample_method']
        self.min_expr = get_float(start_vars, 'min_expr')
        self.p_range_lower = get_float(start_vars, 'p_range_lower', -1.0)
        self.p_range_upper = get_float(start_vars, 'p_range_upper', 1.0)

        if ('loc_chr' in start_vars and 'min_loc_mb' in start_vars
                and 'max_loc_mb' in start_vars):
            self.location_chr = get_string(start_vars, 'loc_chr')
            self.min_location_mb = get_int(start_vars, 'min_loc_mb')
            self.max_location_mb = get_int(start_vars, 'max_loc_mb')
        else:
            self.location_chr = self.min_location_mb = self.max_location_mb = None

        self.get_formatted_corr_type()
        self.return_number = int(start_vars['corr_return_results'])

        #The two if statements below append samples to the sample list based upon whether the user
        #selected Primary Samples Only, Other Samples Only, or All Samples

        # NOTE(review): primary_samples is the group's own samplelist object;
        # if that is a list, the `+=` below extends it in place — confirm
        # this side effect is intended.
        primary_samples = self.dataset.group.samplelist
        if self.dataset.group.parlist is not None:
            primary_samples += self.dataset.group.parlist
        if self.dataset.group.f1list is not None:
            primary_samples += self.dataset.group.f1list

        #If either BXD/whatever Only or All Samples, append all of that group's samplelist
        if corr_samples_group != 'samples_other':
            self.process_samples(start_vars, primary_samples)

        #If either Non-BXD/whatever or All Samples, get all samples from this_trait.data and
        #exclude the primary samples (because they would have been added in the previous
        #if statement if the user selected All Samples)
        if corr_samples_group != 'samples_primary':
            if corr_samples_group == 'samples_other':
                # parlist / f1list may be None (see the checks above);
                # treat None as "no such samples" instead of failing on
                # None + list.
                parents_and_f1s = ((self.dataset.group.parlist or []) +
                                   (self.dataset.group.f1list or []))
                primary_samples = [
                    x for x in primary_samples if x not in parents_and_f1s
                ]
            self.process_samples(start_vars, self.this_trait.data.keys(),
                                 primary_samples)

        self.target_dataset = data_set.create_dataset(
            start_vars['corr_dataset'])
        self.target_dataset.get_trait_data(self.sample_data.keys())

        self.header_fields = get_header_fields(self.target_dataset.type,
                                               self.corr_method)

        if self.target_dataset.type == "ProbeSet":
            self.filter_cols = [7, 6]
        elif self.target_dataset.type == "Publish":
            self.filter_cols = [6, 0]
        else:
            self.filter_cols = [4, 0]

        self.correlation_results = []

        self.correlation_data = {}

        if self.corr_type == "tissue":
            self.trait_symbol_dict = self.dataset.retrieve_genes("Symbol")

            tissue_corr_data = self.do_tissue_correlation_for_all_traits()
            if tissue_corr_data is not None:
                # Only the top traits by tissue correlation get sample
                # correlations.
                for trait in tissue_corr_data.keys()[:self.return_number]:
                    self.get_sample_r_and_p_values(
                        trait, self.target_dataset.trait_data[trait])
            else:
                for trait, values in self.target_dataset.trait_data.iteritems(
                ):
                    self.get_sample_r_and_p_values(trait, values)

        elif self.corr_type == "lit":
            self.trait_geneid_dict = self.dataset.retrieve_genes("GeneId")
            lit_corr_data = self.do_lit_correlation_for_all_traits()

            for trait in lit_corr_data.keys()[:self.return_number]:
                self.get_sample_r_and_p_values(
                    trait, self.target_dataset.trait_data[trait])

        elif self.corr_type == "sample":
            for trait, values in self.target_dataset.trait_data.iteritems(
            ):
                self.get_sample_r_and_p_values(trait, values)

        # Strongest correlations (by absolute value of the first entry) first.
        self.correlation_data = collections.OrderedDict(
            sorted(self.correlation_data.items(),
                   key=lambda t: -abs(t[1][0])))

        if self.target_dataset.type == "ProbeSet" or self.target_dataset.type == "Geno":
            #ZS: Convert min/max chromosome to an int for the location range option
            range_chr_as_int = None
            for order_id, chr_info in self.dataset.species.chromosomes.chromosomes.iteritems(
            ):
                if 'loc_chr' in start_vars:
                    if chr_info.name == self.location_chr:
                        range_chr_as_int = order_id

        for trait in self.correlation_data.keys()[:self.return_number]:
            trait_object = GeneralTrait(dataset=self.target_dataset,
                                        name=trait,
                                        get_qtl_info=True,
                                        get_sample_info=False)

            if self.target_dataset.type == "ProbeSet" or self.target_dataset.type == "Geno":
                #ZS: Convert trait chromosome to an int for the location range option
                chr_as_int = 0
                for order_id, chr_info in self.dataset.species.chromosomes.chromosomes.iteritems(
                ):
                    if chr_info.name == trait_object.chr:
                        chr_as_int = order_id

            if (float(self.correlation_data[trait][0]) >= self.p_range_lower
                    and float(self.correlation_data[trait][0]) <=
                    self.p_range_upper):

                if self.target_dataset.type == "ProbeSet" or self.target_dataset.type == "Geno":
                    # Drop traits that fall outside the requested
                    # expression / chromosome / position limits.
                    if (self.min_expr is not None) and (float(
                            trait_object.mean) < self.min_expr):
                        continue
                    elif range_chr_as_int is not None and (
                            chr_as_int != range_chr_as_int):
                        continue
                    elif (self.min_location_mb is not None) and (float(
                            trait_object.mb) < float(self.min_location_mb)):
                        continue
                    elif (self.max_location_mb is not None) and (float(
                            trait_object.mb) > float(self.max_location_mb)):
                        continue

                (trait_object.sample_r, trait_object.sample_p,
                 trait_object.num_overlap) = self.correlation_data[trait]

                # Set some sane defaults
                trait_object.tissue_corr = 0
                trait_object.tissue_pvalue = 0
                trait_object.lit_corr = 0
                if self.corr_type == "tissue" and tissue_corr_data is not None:
                    trait_object.tissue_corr = tissue_corr_data[trait][1]
                    trait_object.tissue_pvalue = tissue_corr_data[trait][2]
                elif self.corr_type == "lit":
                    trait_object.lit_corr = lit_corr_data[trait][1]
                self.correlation_results.append(trait_object)

        self.target_dataset.get_trait_info(
            self.correlation_results, self.target_dataset.group.species)

        # For ProbeSet-vs-ProbeSet runs, add whichever of the lit / tissue
        # correlations was not the primary correlation type.
        if self.corr_type != "lit" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
            self.do_lit_correlation_for_trait_list()

        if self.corr_type != "tissue" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
            self.do_tissue_correlation_for_trait_list()

    self.json_results = generate_corr_json(self.correlation_results,
                                           self.this_trait, self.dataset,
                                           self.target_dataset)
def do_mapping_for_api(start_vars):
    """Run a mapping for the API and return ``(results, format)``.

    ``start_vars`` must contain 'db' and 'trait_id' (checked with
    assertions).  The mapping method ("gemma" or "rqtl") and the output
    options come from ``initialize_parameters``.

    Returns:
        * format "csv": a list of rows (header row first) and "csv"
        * format "json": the result markers and "json"
        * any other format: the result markers and None
    """
    for required_key in ('db', 'trait_id'):
        assert (required_key in start_vars)

    dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    dataset.group.get_markers()
    this_trait = GeneralTrait(dataset=dataset, name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, dataset)

    samples = []
    vals = []

    # One value per group sample: the trait's value as a string, or "x"
    # when the trait has no entry named after that sample.
    for sample_name in dataset.group.samplelist:
        for data_key in this_trait.data:
            if this_trait.data[data_key].name == sample_name:
                value_text = str(this_trait.data[data_key].value)
                samples.append(data_key)
                vals.append(value_text)
                break
        else:
            vals.append("x")

    mapping_params = initialize_parameters(start_vars, dataset, this_trait)

    covariates = ""  #ZS: It seems to take an empty string as default. This should probably be changed.

    mapping_method = mapping_params['mapping_method']
    if mapping_method == "gemma":
        header_row = ["name", "chr", "Mb", "lod_score", "p_value"]
        use_loco = mapping_params['use_loco']
        gemma_output = gemma_mapping.run_gemma(this_trait, dataset, samples,
                                               vals, covariates,
                                               mapping_params['use_loco'],
                                               mapping_params['maf'])
        #ZS: gemma_mapping returns both results and the filename for LOCO, so need to only grab the former for api
        result_markers = gemma_output[0] if use_loco == "True" else gemma_output
    elif mapping_method == "rqtl":
        header_row = ["name", "chr", "cM", "lod_score"]
        with_permutations = mapping_params['num_perm'] > 0
        rqtl_output = rqtl_mapping.run_rqtl_geno(
            vals, dataset, mapping_params['rqtl_method'],
            mapping_params['rqtl_model'], mapping_params['perm_check'],
            mapping_params['num_perm'], mapping_params['do_control'],
            mapping_params['control_marker'],
            mapping_params['manhattan_plot'], mapping_params['pair_scan'])
        if with_permutations:
            # With permutations the markers are the last of four results.
            _perm_output, _suggestive, _significant, result_markers = rqtl_output
        else:
            result_markers = rqtl_output

    if mapping_params['limit_to']:
        result_markers = result_markers[:mapping_params['limit_to']]

    output_format = mapping_params['format']
    if output_format == "csv":
        output_rows = [header_row]
        output_rows.extend([marker[header] for header in header_row]
                           for marker in result_markers)
        return output_rows, output_format
    if output_format == "json":
        return result_markers, output_format
    return result_markers, None
def __init__(self, kw):
    """Collect the data needed to render a trait page.

    ``kw`` is the request dictionary.  Three cases are handled:

    * 'trait_id' present and ``kw['dataset']`` is not "Temp": a stored trait,
      resolved by ``helper_functions.get_species_dataset_trait``.
    * 'group' present: a newly pasted temporary trait; a "Temp_..." id is
      generated and the pasted values are stored in Redis under that id.
    * otherwise: an existing temporary trait whose values are read back
      from Redis; species and group are parsed out of the trait id.

    Afterwards the BLAT URLs, nearest marker, sample lists, the hidden form
    values (``self.hddn``) and the JavaScript payload (``self.js_data``) are
    prepared.
    """
    logger.debug("in ShowTrait, kw are:", kw)

    if 'trait_id' in kw and kw['dataset'] != "Temp":
        self.temp_trait = False
        self.trait_id = kw['trait_id']
        helper_functions.get_species_dataset_trait(self, kw)
    elif 'group' in kw:
        self.temp_trait = True
        self.trait_id = "Temp_" + kw['species'] + "_" + kw['group'] + "_" + datetime.datetime.now().strftime("%m%d%H%M%S")
        self.temp_species = kw['species']
        self.temp_group = kw['group']
        self.dataset = data_set.create_dataset(dataset_name="Temp", dataset_type="Temp", group_name=self.temp_group)
        self.this_trait = GeneralTrait(dataset=self.dataset,
                                       name=self.trait_id,
                                       cellid=None)
        self.trait_vals = kw['trait_paste'].split()

        # Put values in Redis so they can be looked up later if added to a collection
        Redis.set(self.trait_id, kw['trait_paste'])
    else:
        self.temp_trait = True
        self.trait_id = kw['trait_id']
        # Temp trait ids have the form Temp_<species>_<group>_<timestamp>.
        self.temp_species = self.trait_id.split("_")[1]
        self.temp_group = self.trait_id.split("_")[2]
        self.dataset = data_set.create_dataset(dataset_name="Temp", dataset_type="Temp", group_name=self.temp_group)
        self.this_trait = GeneralTrait(dataset=self.dataset,
                                       name=self.trait_id,
                                       cellid=None)
        self.trait_vals = Redis.get(self.trait_id).split()

    #ZS: Get verify/rna-seq link URLs
    try:
        blatsequence = self.this_trait.blatseq

        #XZ, 06/03/2009: ProbeSet name is not unique among platforms. We should use ProbeSet Id instead.
        # NOTE(review): the dataset and trait names are interpolated into the
        # SQL text; confirm they cannot carry untrusted characters.
        probe_query = (
            "SELECT Probe.Sequence, Probe.Name "
            "FROM Probe, ProbeSet, ProbeSetFreeze, ProbeSetXRef "
            "WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND "
            "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND "
            "ProbeSetFreeze.Name = '%s' AND "
            "ProbeSet.Name = '%s' AND "
            "Probe.ProbeSetId = ProbeSet.Id "
            "order by Probe.SerialOrder") % (self.this_trait.dataset.name,
                                             self.this_trait.name)

        if not blatsequence:
            seqs = g.db.execute(probe_query).fetchall()
            if not seqs:
                # Nothing to BLAT; the handler below blanks both URLs.
                raise ValueError
            else:
                blatsequence = ''
                for seqt in seqs:
                    # Only probes whose name ends in an odd digit are used.
                    if int(seqt[1][-1]) % 2 == 1:
                        blatsequence += seqt[0].strip()

        #--------Hongqiang add this part in order to not only blat ProbeSet, but also blat Probe
        # '%3E' and '%0A' are the URL-encoded '>' and newline.
        blatsequence = '%3E' + self.this_trait.name + '%0A' + blatsequence + '%0A'
        seqs = g.db.execute(probe_query).fetchall()
        for seqt in seqs:
            if int(seqt[1][-1]) % 2 == 1:
                blatsequence += '%3EProbe_' + seqt[1].strip() + '%0A' + seqt[0].strip() + '%0A'

        if self.dataset.group.species == "rat":
            self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('rat', 'rn3', blatsequence)
            self.UTHSC_BLAT_URL = ""
        elif self.dataset.group.species == "mouse":
            self.UCSC_BLAT_URL = webqtlConfig.UTHSC_BLAT2 % ('mouse', 'mm10', blatsequence)
            self.UTHSC_BLAT_URL = webqtlConfig.UTHSC_BLAT % ('mouse', 'mm10', blatsequence)
        elif self.dataset.group.species == "human":
            self.UCSC_BLAT_URL = webqtlConfig.UTHSC_BLAT2 % ('human', 'hg19', blatsequence)
            self.UTHSC_BLAT_URL = ""
        else:
            self.UCSC_BLAT_URL = ""
            self.UTHSC_BLAT_URL = ""
    except Exception:
        # Best effort: any failure while building the sequence (missing
        # attribute, no probes, DB error) just disables the BLAT links.
        self.UCSC_BLAT_URL = ""
        self.UTHSC_BLAT_URL = ""

    self.build_correlation_tools()

    #Get nearest marker for composite mapping
    if not self.temp_trait:
        if hasattr(self.this_trait, 'locus_chr') and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            self.nearest_marker = get_nearest_marker(self.this_trait, self.dataset)
        else:
            self.nearest_marker = ""

    self.make_sample_lists()

    # Todo: Add back in the ones we actually need from below, as we discover we need them
    hddn = OrderedDict()

    if self.dataset.group.allsamples:
        hddn['allsamples'] = ' '.join(self.dataset.group.allsamples)

    hddn['trait_id'] = self.trait_id
    hddn['dataset'] = self.dataset.name
    hddn['temp_trait'] = False
    if self.temp_trait:
        hddn['temp_trait'] = True
        hddn['group'] = self.temp_group
        hddn['species'] = self.temp_species
    hddn['use_outliers'] = False
    hddn['method'] = "pylmm"
    hddn['mapping_display_all'] = True
    hddn['suggestive'] = 0
    hddn['num_perm'] = 0
    hddn['manhattan_plot'] = ""
    hddn['control_marker'] = ""
    if not self.temp_trait:
        if hasattr(self.this_trait, 'locus_chr') and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            hddn['control_marker'] = self.nearest_marker
    hddn['do_control'] = False
    hddn['maf'] = 0.01
    hddn['compare_traits'] = []
    hddn['export_data'] = ""

    # We'll need access to this_trait and hddn in the Jinja2 Template, so we put it inside self
    self.hddn = hddn

    self.temp_uuid = uuid.uuid4()

    self.sample_group_types = OrderedDict()
    if len(self.sample_groups) > 1:
        self.sample_group_types['samples_primary'] = self.dataset.group.name
        self.sample_group_types['samples_other'] = "Other"
        self.sample_group_types['samples_all'] = "All"
    else:
        self.sample_group_types['samples_primary'] = self.dataset.group.name
    sample_lists = [group.sample_list for group in self.sample_groups]

    self.get_mapping_methods()

    self.stats_table_width, self.trait_table_width = get_table_widths(self.sample_groups)

    trait_symbol = None
    if not self.temp_trait:
        if self.this_trait.symbol:
            trait_symbol = self.this_trait.symbol

    js_data = dict(trait_id=self.trait_id,
                   trait_symbol=trait_symbol,
                   dataset_type=self.dataset.type,
                   data_scale=self.dataset.data_scale,
                   sample_group_types=self.sample_group_types,
                   sample_lists=sample_lists,
                   attribute_names=self.sample_groups[0].attributes,
                   temp_uuid=self.temp_uuid)
    self.js_data = js_data
def run_analysis(self, requestform):
    """Run the PheWAS analysis for the trait named in ``requestform``.

    Reads ``trait_id``, ``dataset``, ``num_region`` and ``sel_mtadjust``
    from ``requestform``, stores them on ``self`` and fills
    ``self.results`` with the keys ``imgurl1``, ``imgloc1``, ``mtadjust``,
    ``phewas1``, ``phewas2``, ``tabulardata`` and ``R_debuglog``.
    """
    logger.info("Starting PheWAS analysis on dataset")
    # Location of the BXD genotypes
    geno_path = locate("BXD.geno", "genotype")
    # Location of the pre-computed EMMA results
    precomputed_path = locate_phewas("PheWAS_pval_EMMA_norm.RData", "auwerx")

    # User parameters, stored/updated on self
    self.trait_id = requestform["trait_id"]
    self.datasetname = requestform["dataset"]
    self.dataset = data_set.create_dataset(self.datasetname)
    self.region = int(requestform["num_region"])
    self.mtadjust = str(requestform["sel_mtadjust"])

    # Debug output
    logger.info("self.trait_id:" + self.trait_id + "\n")
    logger.info("self.datasetname:" + self.datasetname + "\n")
    logger.info("self.dataset.type:" + self.dataset.type + "\n")

    self.this_trait = GeneralTrait(dataset=self.dataset,
                                   name=self.trait_id,
                                   get_qtl_info=False,
                                   get_sample_info=False)
    logger.info(vars(self.this_trait))

    # Position of the trait: chromosome as text, Mb as an integer
    self.chr = str(self.this_trait.chr)
    self.mb = int(self.this_trait.mb)

    logger.info("location:" + self.chr + ":" + str(self.mb) + "+/-" +
                str(self.region) + "\n")

    # Parse the genotype file to build the marker map
    geno_parser = genofile_parser.ConvertGenoFile(geno_path)
    geno_parser.process_csv()
    # Flat, row-major list: name, chr, Mb for each marker in turn
    flat_marker_info = [marker[field]
                        for marker in geno_parser.markers
                        for field in ("name", "chr", "Mb")]

    marker_count = len(geno_parser.markers)
    row_names = r_seq(1, marker_count)

    # SNP aligner: an R matrix with one row per marker
    snp_aligner = ro.r.matrix(flat_marker_info,
                              nrow=marker_count,
                              dimnames=r_list(row_names,
                                              r_c("SNP", "Chr", "Pos")),
                              ncol=3,
                              byrow=True)

    # Phenotype aligner, created through R
    pheno_aligner = self.r_create_Pheno_aligner()

    results = {}
    self.results = results
    results['imgurl1'] = webqtlUtil.genRandStr("phewas_") + ".png"
    results['imgloc1'] = GENERATED_IMAGE_DIR + results['imgurl1']
    results['mtadjust'] = self.mtadjust
    logger.info("IMAGE AT:", results['imgurl1'])
    logger.info("IMAGE AT:", results['imgloc1'])

    # Create the PheWAS plot (The gene/probe name, chromosome and gene/probe positions should come from the user input)
    # TODO: generate the PDF in the temp folder, with a unique name
    assert (precomputed_path)
    assert (pheno_aligner)
    assert (snp_aligner)
    phewas_output = self.r_PheWASManhattan("Test", precomputed_path,
                                           pheno_aligner, snp_aligner,
                                           "None", self.chr, self.mb,
                                           self.region, results['imgloc1'],
                                           self.mtadjust)
    for position, key in enumerate(('phewas1', 'phewas2', 'tabulardata',
                                    'R_debuglog')):
        results[key] = phewas_output[position]

    logger.info("Initialization of PheWAS done !")
def __init__(self, kw):
    """Run a global search and prepare the result table.

    ``kw`` must contain ``type`` (``"gene"`` or ``"phenotype"``) and
    ``terms`` (the search text).  For either type the constructor sets

    * ``trait_count``   -- number of rows found (the queries are limited
      to 6000 rows),
    * ``trait_list``    -- the rows as a JSON string,
    * ``header_fields`` -- column titles for the result table.

    For any other ``type`` none of these attributes are set.

    A phenotype search term of the form ``XXX_term`` (three characters
    before the first underscore) restricts the search to the group whose
    ``InbredSetCode`` is ``XXX``.
    """
    assert ('type' in kw)
    assert ('terms' in kw)

    self.type = kw['type']
    self.terms = kw['terms']
    assert (is_str(self.type))

    def _format_or_na(pattern, value):
        # Format a numeric column, showing "N/A" for empty/NULL values.
        if value == "" or value is None:
            return "N/A"
        return pattern % value

    if self.type == "gene":
        # NOTE(review): the user-supplied search terms are interpolated
        # directly into the SQL text (injection risk); switch to bound
        # parameters once the g.db execute() signature is confirmed.
        sql = """
            SELECT
            Species.`Name` AS species_name,
            InbredSet.`Name` AS inbredset_name,
            Tissue.`Name` AS tissue_name,
            ProbeSetFreeze.Name AS probesetfreeze_name,
            ProbeSetFreeze.FullName AS probesetfreeze_fullname,
            ProbeSet.Name AS probeset_name,
            ProbeSet.Symbol AS probeset_symbol,
            ProbeSet.`description` AS probeset_description,
            ProbeSet.Chr AS chr,
            ProbeSet.Mb AS mb,
            ProbeSetXRef.Mean AS mean,
            ProbeSetXRef.LRS AS lrs,
            ProbeSetXRef.`Locus` AS locus,
            ProbeSetXRef.`pValue` AS pvalue,
            ProbeSetXRef.`additive` AS additive
            FROM Species, InbredSet, ProbeSetXRef, ProbeSet, ProbeFreeze, ProbeSetFreeze, Tissue
            WHERE InbredSet.`SpeciesId`=Species.`Id`
            AND ProbeFreeze.InbredSetId=InbredSet.`Id`
            AND ProbeFreeze.`TissueId`=Tissue.`Id`
            AND ProbeSetFreeze.ProbeFreezeId=ProbeFreeze.Id
            AND ( MATCH (ProbeSet.Name,ProbeSet.description,ProbeSet.symbol,alias,GenbankId, UniGeneId, Probe_Target_Description) AGAINST ('%s' IN BOOLEAN MODE) )
            AND ProbeSet.Id = ProbeSetXRef.ProbeSetId
            AND ProbeSetXRef.ProbeSetFreezeId=ProbeSetFreeze.Id
            AND ProbeSetFreeze.confidentiality < 1
            AND ProbeSetFreeze.public > 0
            ORDER BY species_name, inbredset_name, tissue_name, probesetfreeze_name, probeset_name
            LIMIT 6000
            """ % (self.terms)
        with Bench("Running query"):
            logger.sql(sql)
            # Named ``rows`` rather than ``re`` so the stdlib module name
            # is not shadowed.
            rows = g.db.execute(sql).fetchall()

        trait_list = []
        with Bench("Creating trait objects"):
            for i, line in enumerate(rows):
                this_trait = {}
                this_trait['index'] = i + 1
                this_trait['name'] = line[5]
                this_trait['dataset'] = line[3]
                this_trait['dataset_fullname'] = line[4]
                this_trait['hmac'] = hmac.data_hmac('{}:{}'.format(
                    line[5], line[3]))
                this_trait['species'] = line[0]
                this_trait['group'] = line[1]
                this_trait['tissue'] = line[2]
                this_trait['symbol'] = line[6]
                # A NULL description must not abort the whole search.
                if line[7] is not None:
                    this_trait['description'] = line[7].decode(
                        'utf-8', 'replace')
                else:
                    this_trait['description'] = "N/A"
                this_trait['location_repr'] = 'N/A'
                if (line[8] != "NULL" and line[8] != "") and (line[9] != 0):
                    this_trait['location_repr'] = 'Chr%s: %.6f' % (
                        line[8], float(line[9]))
                try:
                    this_trait['mean'] = '%.3f' % line[10]
                except (TypeError, ValueError):
                    # NULL or non-numeric mean
                    this_trait['mean'] = "N/A"
                this_trait['LRS_score_repr'] = _format_or_na('%3.1f', line[11])
                this_trait['additive'] = _format_or_na('%.3f', line[14])

                # The trait object is built only to obtain the peak locus.
                trait_ob = GeneralTrait(dataset_name=this_trait['dataset'],
                                        name=this_trait['name'],
                                        get_qtl_info=True,
                                        get_sample_info=False)
                max_lrs_text = "N/A"
                if trait_ob.locus_chr != "" and trait_ob.locus_mb != "":
                    max_lrs_text = "Chr" + str(
                        trait_ob.locus_chr) + ": " + str(trait_ob.locus_mb)
                this_trait['max_lrs_text'] = max_lrs_text

                trait_list.append(this_trait)

        self.trait_count = len(trait_list)
        self.trait_list = json.dumps(trait_list)

        self.header_fields = [
            'Index', 'Record', 'Species', 'Group', 'Tissue', 'Dataset',
            'Symbol', 'Description', 'Location', 'Mean', 'Max LRS',
            'Max LRS Location', 'Additive Effect'
        ]

    elif self.type == "phenotype":
        search_term = self.terms
        group_clause = ""
        if "_" in self.terms:
            term_parts = self.terms.split("_")
            # "BXD_12345"-style terms: a 3-character group code prefix.
            if len(term_parts[0]) == 3:
                search_term = term_parts[1]
                group_clause = "AND InbredSet.`InbredSetCode` = '{}'".format(
                    term_parts[0])
        # NOTE(review): the user-supplied search term is interpolated
        # directly into the SQL text (injection risk); switch to bound
        # parameters once the g.db execute() signature is confirmed.
        sql = """
            SELECT
            Species.`Name`,
            InbredSet.`Name`,
            PublishFreeze.`Name`,
            PublishFreeze.`FullName`,
            PublishXRef.`Id`,
            Phenotype.`Pre_publication_description`,
            Phenotype.`Post_publication_description`,
            Publication.`Authors`,
            Publication.`Year`,
            Publication.`PubMed_ID`,
            PublishXRef.`LRS`,
            PublishXRef.`additive`,
            InbredSet.`InbredSetCode`,
            PublishXRef.`mean`
            FROM Species,InbredSet,PublishFreeze,PublishXRef,Phenotype,Publication
            WHERE PublishXRef.`InbredSetId`=InbredSet.`Id`
            AND PublishFreeze.`InbredSetId`=InbredSet.`Id`
            AND InbredSet.`SpeciesId`=Species.`Id`
            {0}
            AND PublishXRef.`PhenotypeId`=Phenotype.`Id`
            AND PublishXRef.`PublicationId`=Publication.`Id`
            AND (Phenotype.Post_publication_description REGEXP "[[:<:]]{1}[[:>:]]"
                OR Phenotype.Pre_publication_description REGEXP "[[:<:]]{1}[[:>:]]"
                OR Phenotype.Pre_publication_abbreviation REGEXP "[[:<:]]{1}[[:>:]]"
                OR Phenotype.Post_publication_abbreviation REGEXP "[[:<:]]{1}[[:>:]]"
                OR Phenotype.Lab_code REGEXP "[[:<:]]{1}[[:>:]]"
                OR Publication.PubMed_ID REGEXP "[[:<:]]{1}[[:>:]]"
                OR Publication.Abstract REGEXP "[[:<:]]{1}[[:>:]]"
                OR Publication.Title REGEXP "[[:<:]]{1}[[:>:]]"
                OR Publication.Authors REGEXP "[[:<:]]{1}[[:>:]]"
                OR PublishXRef.Id REGEXP "[[:<:]]{1}[[:>:]]")
            ORDER BY Species.`Name`, InbredSet.`Name`, PublishXRef.`Id`
            LIMIT 6000
            """.format(group_clause, search_term)
        logger.sql(sql)
        rows = g.db.execute(sql).fetchall()

        trait_list = []
        with Bench("Creating trait objects"):
            for i, line in enumerate(rows):
                this_trait = {}
                this_trait['index'] = i + 1
                this_trait['name'] = str(line[4])
                # Prefix the record id with the group code when there is one.
                if line[12]:
                    this_trait['display_name'] = line[12] + "_" + str(
                        this_trait['name'])
                else:
                    this_trait['display_name'] = this_trait['name']
                this_trait['dataset'] = line[2]
                this_trait['dataset_fullname'] = line[3]
                this_trait['hmac'] = hmac.data_hmac('{}:{}'.format(
                    line[4], line[2]))
                this_trait['species'] = line[0]
                this_trait['group'] = line[1]
                # The post-publication description is used only when a
                # PubMed id (column 9) exists; otherwise fall back to the
                # pre-publication text.
                if line[9] is not None and line[6] is not None:
                    this_trait['description'] = line[6].decode(
                        'utf-8', 'replace')
                elif line[5] is not None:
                    this_trait['description'] = line[5].decode(
                        'utf-8', 'replace')
                else:
                    this_trait['description'] = "N/A"
                if line[13] is not None and line[13] != "":
                    this_trait['mean'] = line[13]
                else:
                    this_trait['mean'] = "N/A"
                this_trait['authors'] = line[7]
                this_trait['year'] = line[8]
                # A NULL year must not abort the whole search.
                if this_trait['year'] and this_trait['year'].isdigit():
                    this_trait['pubmed_text'] = this_trait['year']
                else:
                    this_trait['pubmed_text'] = "N/A"
                if line[9] != "" and line[9] is not None:
                    # The link is built from the PubMed id (column 9);
                    # column 8 is the publication year.
                    this_trait['pubmed_link'] = (
                        webqtlConfig.PUBMEDLINK_URL % line[9])
                else:
                    this_trait['pubmed_link'] = "N/A"
                this_trait['LRS_score_repr'] = _format_or_na('%3.1f', line[10])
                this_trait['additive'] = _format_or_na('%.3f', line[11])

                this_trait['max_lrs_text'] = "N/A"
                # The peak locus is looked up only when the dataset name is
                # "<group>Publish".
                if this_trait['dataset'] == this_trait['group'] + "Publish":
                    try:
                        trait_ob = GeneralTrait(
                            dataset_name=this_trait['dataset'],
                            name=this_trait['name'],
                            get_qtl_info=True,
                            get_sample_info=False)
                        if trait_ob.locus_chr != "" and trait_ob.locus_mb != "":
                            this_trait['max_lrs_text'] = "Chr" + str(
                                trait_ob.locus_chr) + ": " + str(
                                    trait_ob.locus_mb)
                    except Exception:
                        # Best effort: a trait that cannot be built just
                        # shows no peak location.
                        this_trait['max_lrs_text'] = "N/A"

                trait_list.append(this_trait)

        self.trait_count = len(trait_list)
        self.trait_list = json.dumps(trait_list)

        self.header_fields = [
            'Index', 'Species', 'Group', 'Record', 'Description', 'Authors',
            'Year', 'Max LRS', 'Max LRS Location', 'Additive Effect'
        ]
def __init__(self, kw):
    """Prepare all data the trait page template and its JavaScript need.

    ``kw`` is the request parameter mapping.  A regular trait is resolved
    through ``helper_functions.get_species_dataset_trait``; a temporary
    trait is either created from ``kw['trait_paste']`` (when ``group`` is
    given) or reloaded from Redis using ``kw['trait_id']``.

    Sets ``nearest_marker`` (regular traits only), ``hddn``, ``temp_uuid``,
    ``sample_group_types``, ``trait_table_width`` and ``js_data``.
    """
    logger.debug("in ShowTrait, kw are:", kw)

    if 'trait_id' in kw and kw['dataset'] != "Temp":
        self.temp_trait = False
        self.trait_id = kw['trait_id']
        helper_functions.get_species_dataset_trait(self, kw)
    else:
        self.temp_trait = True
        newly_pasted = 'group' in kw
        if newly_pasted:
            timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
            self.trait_id = "Temp_" + kw['species'] + "_" + kw['group'] + "_" + timestamp
            self.temp_species = kw['species']
            self.temp_group = kw['group']
        else:
            # Existing temp trait: species and group are the 2nd and 3rd
            # underscore-separated fields of the id.
            self.trait_id = kw['trait_id']
            id_fields = self.trait_id.split("_")
            self.temp_species = id_fields[1]
            self.temp_group = id_fields[2]

        self.dataset = data_set.create_dataset(dataset_name="Temp",
                                               dataset_type="Temp",
                                               group_name=self.temp_group)
        self.this_trait = GeneralTrait(dataset=self.dataset,
                                       name=self.trait_id,
                                       cellid=None)

        if newly_pasted:
            self.trait_vals = kw['trait_paste'].split()
            # Put values in Redis so they can be looked up later if added to a collection
            Redis.set(self.trait_id, kw['trait_paste'])
        else:
            self.trait_vals = Redis.get(self.trait_id).split()

    self.build_correlation_tools()

    def _has_usable_locus():
        # True when the trait has a non-empty locus_chr and the dataset is
        # neither a Geno nor a Publish dataset.
        return (hasattr(self.this_trait, 'locus_chr')
                and self.this_trait.locus_chr != ""
                and self.dataset.type != "Geno"
                and self.dataset.type != "Publish")

    # Get nearest marker for composite mapping
    if not self.temp_trait:
        if _has_usable_locus():
            self.nearest_marker = get_nearest_marker(self.this_trait,
                                                     self.dataset)
        else:
            self.nearest_marker = ""

    self.make_sample_lists()

    # Todo: Add back in the ones we actually need from below, as we discover we need them
    hidden = OrderedDict()

    all_samples = self.dataset.group.allsamples
    if all_samples:
        hidden['allsamples'] = ' '.join(all_samples)

    hidden['trait_id'] = self.trait_id
    hidden['dataset'] = self.dataset.name
    if self.temp_trait:
        hidden['temp_trait'] = True
        hidden['group'] = self.temp_group
        hidden['species'] = self.temp_species
    else:
        hidden['temp_trait'] = False

    # Fixed defaults for the mapping form
    for field, default in (('use_outliers', False),
                           ('method', "pylmm"),
                           ('mapping_display_all', True),
                           ('suggestive', 0),
                           ('num_perm', 0),
                           ('manhattan_plot', ""),
                           ('control_marker', "")):
        hidden[field] = default

    if not self.temp_trait and _has_usable_locus():
        hidden['control_marker'] = self.nearest_marker

    hidden['do_control'] = False
    hidden['maf'] = 0.01
    hidden['compare_traits'] = []
    hidden['export_data'] = ""

    # We'll need access to this_trait and hddn in the Jinja2 Template, so we put it inside self
    self.hddn = hidden

    self.temp_uuid = uuid.uuid4()

    group_name = self.dataset.group.name
    group_labels = OrderedDict()
    self.sample_group_types = group_labels
    if len(self.sample_groups) > 1:
        group_labels['samples_primary'] = group_name + " Only"
        group_labels['samples_other'] = "Non-" + self.dataset.group.name
        group_labels['samples_all'] = "All Cases"
    else:
        group_labels['samples_primary'] = group_name

    sample_lists = []
    for sample_group in self.sample_groups:
        sample_lists.append(sample_group.sample_list)

    self.get_mapping_methods()

    self.trait_table_width = get_trait_table_width(self.sample_groups)

    trait_symbol = None
    if not self.temp_trait and self.this_trait.symbol:
        trait_symbol = self.this_trait.symbol

    # Values handed to the page's JavaScript
    self.js_data = {
        'trait_id': self.trait_id,
        'trait_symbol': trait_symbol,
        'dataset_type': self.dataset.type,
        'data_scale': self.dataset.data_scale,
        'sample_group_types': self.sample_group_types,
        'sample_lists': sample_lists,
        'attribute_names': self.sample_groups[0].attributes,
        'temp_uuid': self.temp_uuid,
    }
def __init__(self, params):
    """Compute the data for a two-trait correlation scatter plot.

    ``params`` must provide ``dataset_1``, ``dataset_2``, ``trait_1`` and
    ``trait_2``.  Optional display settings (with their fallbacks) are
    ``width`` (800), ``height`` (600), ``circle_color`` ('#3D85C6'),
    ``circle_radius`` (5), ``line_color`` ('#FF0000') and ``line_width``
    (1); a missing or non-integer value silently falls back to the default.

    Sets ``data`` (the two value lists), ``rdata`` (their ranks),
    ``indIDs`` and ``js_data`` / ``jsdata`` (the same dict under both
    names), which holds the values plus the linear-regression results for
    the raw and the ranked values.
    """
    def _option(key, default, convert=None):
        # Read an optional display parameter, falling back to ``default``
        # when it is absent or cannot be converted.
        try:
            value = params[key]
            return convert(value) if convert is not None else value
        except (KeyError, ValueError, TypeError):
            return default

    self.data_set_1 = data_set.create_dataset(params['dataset_1'])
    self.data_set_2 = data_set.create_dataset(params['dataset_2'])
    self.trait_1 = GeneralTrait(name=params['trait_1'], dataset=self.data_set_1)
    self.trait_2 = GeneralTrait(name=params['trait_2'], dataset=self.data_set_2)

    width = self.width = _option('width', 800, int)
    height = self.height = _option('height', 600, int)
    circle_color = self.circle_color = _option('circle_color', '#3D85C6')
    circle_radius = self.circle_radius = _option('circle_radius', 5, int)
    line_color = self.line_color = _option('line_color', '#FF0000')
    line_width = self.line_width = _option('line_width', 1, int)

    samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(
        self.trait_1.data, self.trait_2.data)

    self.indIDs = samples_1.keys()
    vals_1 = [samples_1[sample].value for sample in samples_1.keys()]
    vals_2 = [samples_2[sample].value for sample in samples_2.keys()]
    self.data = [vals_1, vals_2]

    x = np.array(vals_1)
    y = np.array(vals_2)

    # Regression on the raw values
    slope, intercept, r_value, p_value, _std_err = stats.linregress(x, y)

    # Regression on the ranks of the values
    rx = stats.rankdata(x)
    ry = stats.rankdata(y)
    self.rdata = [rx.tolist(), ry.tolist()]
    srslope, srintercept, srr_value, srp_value, _srstd_err = stats.linregress(rx, ry)

    self.js_data = dict(
        data=self.data,
        rdata=self.rdata,
        indIDs=self.indIDs,
        trait_1=self.trait_1.dataset.name + ": " + str(self.trait_1.name),
        trait_2=self.trait_2.dataset.name + ": " + str(self.trait_2.name),
        samples_1=samples_1,
        samples_2=samples_2,
        num_overlap=num_overlap,
        vals_1=vals_1,
        vals_2=vals_2,
        slope=slope,
        intercept=intercept,
        r_value=r_value,
        p_value=p_value,
        srslope=srslope,
        srintercept=srintercept,
        srr_value=srr_value,
        srp_value=srp_value,
        width=width,
        height=height,
        circle_color=circle_color,
        circle_radius=circle_radius,
        line_color=line_color,
        line_width=line_width,
    )
    self.jsdata = self.js_data
def __init__(self, params):
    """Compute the data for a two-trait correlation scatter plot.

    ``params`` must provide ``dataset_1``, ``dataset_2``, ``trait_1`` and
    ``trait_2``.  The constructor stores the paired values (``data``),
    their ranks (``rdata``), the sample ids (``indIDs``) and a ``js_data``
    dict (also available as ``jsdata``) containing the values and the
    linear-regression results for both the raw and the ranked values.
    """
    first_dataset = data_set.create_dataset(params['dataset_1'])
    self.data_set_1 = first_dataset
    second_dataset = data_set.create_dataset(params['dataset_2'])
    self.data_set_2 = second_dataset

    self.trait_1 = GeneralTrait(name=params['trait_1'], dataset=first_dataset)
    self.trait_2 = GeneralTrait(name=params['trait_2'], dataset=second_dataset)

    normalized = corr_result_helpers.normalize_values_with_samples(
        self.trait_1.data, self.trait_2.data)
    samples_1, samples_2, num_overlap = normalized

    self.data = []
    self.indIDs = samples_1.keys()

    vals_1 = [samples_1[name].value for name in samples_1.keys()]
    self.data.append(vals_1)

    vals_2 = [samples_2[name].value for name in samples_2.keys()]
    self.data.append(vals_2)

    x = np.array(vals_1)
    y = np.array(vals_2)

    # Regression on the raw values
    raw_fit = stats.linregress(x, y)
    slope, intercept, r_value, p_value, std_err = raw_fit

    # Regression on the ranked values
    rx = stats.rankdata(x)
    ry = stats.rankdata(y)
    self.rdata = [rx.tolist(), ry.tolist()]
    rank_fit = stats.linregress(rx, ry)
    srslope, srintercept, srr_value, srp_value, srstd_err = rank_fit

    plot_payload = {
        'data': self.data,
        'rdata': self.rdata,
        'indIDs': self.indIDs,
        'trait_1': self.trait_1.dataset.name + ": " + str(self.trait_1.name),
        'trait_2': self.trait_2.dataset.name + ": " + str(self.trait_2.name),
        'samples_1': samples_1,
        'samples_2': samples_2,
        'num_overlap': num_overlap,
        'vals_1': vals_1,
        'vals_2': vals_2,
        'slope': slope,
        'intercept': intercept,
        'r_value': r_value,
        'p_value': p_value,
        'srslope': srslope,
        'srintercept': srintercept,
        'srr_value': srr_value,
        'srp_value': srp_value,
    }
    self.js_data = plot_payload
    self.jsdata = self.js_data
def __init__(self, start_vars):
    """Correlate one trait against every trait of a target dataset.

    ``start_vars`` must contain ``corr_type`` (``"sample"``, ``"tissue"``
    or ``"lit"``), ``dataset``, ``corr_sample_method``,
    ``corr_samples_group``, ``corr_dataset``, ``min_expr`` and
    ``corr_return_results``; when ``loc_chr`` is given, ``min_loc_mb`` and
    ``max_loc_mb`` are required as well.  Missing keys raise
    ``AssertionError``.

    The constructor fills ``correlation_data`` (sorted by descending
    absolute value of the first element of each entry),
    ``correlation_results`` (trait objects that pass the p-range,
    expression and location filters) and ``json_results``.
    """
    # get trait list from db (database name)
    # calculate correlation with Base vector and targets

    # Check parameters
    assert('corr_type' in start_vars)
    assert(is_str(start_vars['corr_type']))
    assert('dataset' in start_vars)
    # assert('group' in start_vars) permitted to be empty?
    assert('corr_sample_method' in start_vars)
    assert('corr_samples_group' in start_vars)
    assert('corr_dataset' in start_vars)
    assert('min_expr' in start_vars)
    assert('corr_return_results' in start_vars)
    if 'loc_chr' in start_vars:
        assert('min_loc_mb' in start_vars)
        assert('max_loc_mb' in start_vars)

    with Bench("Doing correlations"):
        if start_vars['dataset'] == "Temp":
            self.dataset = data_set.create_dataset(dataset_name="Temp",
                                                   dataset_type="Temp",
                                                   group_name=start_vars['group'])
            self.trait_id = "Temp"
            self.this_trait = GeneralTrait(dataset=self.dataset,
                                           name=self.trait_id,
                                           cellid=None)
        else:
            helper_functions.get_species_dataset_trait(self, start_vars)

        self.dataset.group.read_genotype_file()

        corr_samples_group = start_vars['corr_samples_group']

        self.sample_data = {}
        self.corr_type = start_vars['corr_type']
        self.corr_method = start_vars['corr_sample_method']
        self.min_expr = get_float(start_vars, 'min_expr')
        self.p_range_lower = get_float(start_vars, 'p_range_lower', -1.0)
        self.p_range_upper = get_float(start_vars, 'p_range_upper', 1.0)

        # The location filter is optional; default to "no filter" so the
        # attributes always exist when the result loop reads them.
        self.location_chr = None
        self.min_location_mb = None
        self.max_location_mb = None
        if ('loc_chr' in start_vars and
                'min_loc_mb' in start_vars and
                'max_loc_mb' in start_vars):
            self.location_chr = get_string(start_vars, 'loc_chr')
            self.min_location_mb = get_int(start_vars, 'min_loc_mb')
            self.max_location_mb = get_int(start_vars, 'max_loc_mb')

        self.get_formatted_corr_type()
        self.return_number = int(start_vars['corr_return_results'])

        # The two if statements below append samples to the sample list based upon whether the user
        # selected Primary Samples Only, Other Samples Only, or All Samples

        group = self.dataset.group
        parents = group.parlist if group.parlist is not None else []
        f1_samples = group.f1list if group.f1list is not None else []

        # Work on a copy: extending the group's own samplelist in place
        # would leak parents/F1 into the shared group object.
        primary_samples = list(group.samplelist or [])
        primary_samples += parents
        primary_samples += f1_samples

        # If either BXD/whatever Only or All Samples, append all of that group's samplelist
        if corr_samples_group != 'samples_other':
            self.process_samples(start_vars, primary_samples)

        # If either Non-BXD/whatever or All Samples, get all samples from this_trait.data and
        # exclude the primary samples (because they would have been added in the previous
        # if statement if the user selected All Samples)
        if corr_samples_group != 'samples_primary':
            if corr_samples_group == 'samples_other':
                # Parents and F1 are not excluded in this mode.
                parents_and_f1 = set(parents) | set(f1_samples)
                primary_samples = [x for x in primary_samples
                                   if x not in parents_and_f1]
            self.process_samples(start_vars, self.this_trait.data.keys(), primary_samples)

        self.target_dataset = data_set.create_dataset(start_vars['corr_dataset'])
        self.target_dataset.get_trait_data(self.sample_data.keys())

        self.correlation_results = []
        self.correlation_data = {}

        tissue_corr_data = None
        lit_corr_data = None

        if self.corr_type == "tissue":
            self.trait_symbol_dict = self.dataset.retrieve_genes("Symbol")

            tissue_corr_data = self.do_tissue_correlation_for_all_traits()
            if tissue_corr_data is not None:
                for trait in tissue_corr_data.keys()[:self.return_number]:
                    self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait])
            else:
                for trait, values in self.target_dataset.trait_data.iteritems():
                    self.get_sample_r_and_p_values(trait, values)

        elif self.corr_type == "lit":
            self.trait_geneid_dict = self.dataset.retrieve_genes("GeneId")
            lit_corr_data = self.do_lit_correlation_for_all_traits()

            for trait in lit_corr_data.keys()[:self.return_number]:
                self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait])

        elif self.corr_type == "sample":
            for trait, values in self.target_dataset.trait_data.iteritems():
                self.get_sample_r_and_p_values(trait, values)

        # Strongest correlations (by absolute value) first
        self.correlation_data = collections.OrderedDict(
            sorted(self.correlation_data.items(),
                   key=lambda t: -abs(t[1][0])))

        # Only ProbeSet and Geno targets get the expression and location
        # filters applied.
        target_has_location = (self.target_dataset.type == "ProbeSet" or
                               self.target_dataset.type == "Geno")
        if target_has_location:
            # ZS: Map chromosome names to their integer order once, for the
            # location range option (later entries win on duplicate names,
            # as in a linear scan).
            chr_order = {}
            for order_id, chr_info in self.dataset.species.chromosomes.chromosomes.iteritems():
                chr_order[chr_info.name] = order_id
            range_chr_as_int = chr_order.get(self.location_chr)

        for trait in self.correlation_data.keys()[:self.return_number]:
            trait_object = GeneralTrait(dataset=self.target_dataset,
                                        name=trait,
                                        get_qtl_info=True,
                                        get_sample_info=False)

            sample_r = float(self.correlation_data[trait][0])
            if not (self.p_range_lower <= sample_r <= self.p_range_upper):
                continue

            if target_has_location:
                # 0 stands for "chromosome not found"
                chr_as_int = chr_order.get(trait_object.chr, 0)

                if (self.min_expr is not None) and (float(trait_object.mean) < self.min_expr):
                    continue
                elif range_chr_as_int is not None and (chr_as_int != range_chr_as_int):
                    continue
                elif (self.min_location_mb is not None) and (float(trait_object.mb) < float(self.min_location_mb)):
                    continue
                elif (self.max_location_mb is not None) and (float(trait_object.mb) > float(self.max_location_mb)):
                    continue

            (trait_object.sample_r,
             trait_object.sample_p,
             trait_object.num_overlap) = self.correlation_data[trait]

            # Set some sane defaults
            trait_object.tissue_corr = 0
            trait_object.tissue_pvalue = 0
            trait_object.lit_corr = 0
            if self.corr_type == "tissue" and tissue_corr_data is not None:
                trait_object.tissue_corr = tissue_corr_data[trait][1]
                trait_object.tissue_pvalue = tissue_corr_data[trait][2]
            elif self.corr_type == "lit":
                trait_object.lit_corr = lit_corr_data[trait][1]
            self.correlation_results.append(trait_object)

        self.target_dataset.get_trait_info(self.correlation_results, self.target_dataset.group.species)

        if self.corr_type != "lit" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
            self.do_lit_correlation_for_trait_list()

        if self.corr_type != "tissue" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
            self.do_tissue_correlation_for_trait_list()

    self.json_results = generate_corr_json(self.correlation_results, self.this_trait, self.dataset, self.target_dataset)