def view_collection():
    """Render (or dump as JSON) the traits of one user collection.

    The collection is selected by the ``uc_id`` request argument. Each
    member is a ``"<trait name>:<dataset name>"`` string. When the request
    arguments contain ``json`` the jsonable form of every trait is returned
    as a JSON string; otherwise the ``collections/view.html`` template is
    rendered with ``trait_obs`` and ``uc``.

    Raises:
        StopIteration: if no collection of the session matches ``uc_id``.
    """
    args = request.args
    wanted_id = args['uc_id']
    uc = next(entry for entry in g.user_session.user_collections
              if entry["id"] == wanted_id)

    trait_obs = []
    json_version = []
    for member in uc["members"]:
        name, dataset_name = member.split(':')
        is_temp = dataset_name == "Temp"
        if is_temp:
            # Temp trait names embed their group as the third "_" field.
            group = name.split("_")[2]
            dataset = create_dataset(dataset_name,
                                     dataset_type="Temp",
                                     group_name=group)
        else:
            dataset = create_dataset(dataset_name)

        trait_ob = create_trait(name=name, dataset=dataset)
        if not is_temp:
            # Only non-Temp traits get their stored info (incl. QTL info).
            trait_ob = retrieve_trait_info(trait_ob, dataset,
                                           get_qtl_info=True)

        trait_obs.append(trait_ob)
        json_version.append(jsonable(trait_ob))

    if "json" in args:
        return json.dumps(json_version)
    return render_template("collections/view.html",
                           trait_obs=trait_obs,
                           uc=uc)
def get_diff_of_vals(new_vals: Dict, trait_id: str) -> Dict:
    """Get the diff between current sample values and the values in the DB.

    Given a dict of the changed values and the trait/dataset ID, return a
    Dict with keys corresponding to each sample with a changed value and a
    value that is a dict with keys for the old value and new value.

    Args:
        new_vals: Mapping of sample name to its submitted value.
        trait_id: ``"<trait name>:<dataset name>"`` identifier.

    Returns:
        ``{sample: {"new_val": ..., "old_val": ...}}`` for every sample whose
        value (rounded to 3 decimals) differs. A value that is missing or
        not convertible to ``float`` is represented as ``"x"``.

    Raises:
        IndexError: if ``trait_id`` contains no ``":"``.
    """
    id_parts = trait_id.split(":")
    trait_name = id_parts[0]
    dataset_name = id_parts[1]
    trait_ob = create_trait(name=trait_name, dataset_name=dataset_name)

    old_vals = {
        sample: trait_ob.data[sample].value
        for sample in trait_ob.data
    }

    def _rounded_or_x(values, sample):
        """Return the value rounded to 3 decimals, or "x" if unusable."""
        try:
            return round(float(values[sample]), 3)
        except (KeyError, TypeError, ValueError, OverflowError):
            # Missing sample, None, or a non-numeric placeholder such as "x".
            return "x"

    diff_dict = {}
    # Every sample present on either side has to be compared.
    for sample in set(new_vals) | set(old_vals):
        new_val = _rounded_or_x(new_vals, sample)
        old_val = _rounded_or_x(old_vals, sample)
        if new_val != old_val:
            diff_dict[sample] = {"new_val": new_val, "old_val": old_val}

    return diff_dict
def gen_covariates_file(this_dataset, covariates, samples):
    """Write the covariate values of the selected samples to a flat file.

    Args:
        this_dataset: Dataset of the trait being mapped; its group supplies
            the sample order and the output file name.
        covariates: Comma-separated ``"<trait name>:<dataset name>"`` items.
        samples: Sample names to include (assumed hashable, e.g. strings).

    The file ``<flat_files('mapping')>/<group name>_covariates.txt`` gets one
    row per included sample and one tab-terminated column per covariate;
    samples without a value for a covariate are written as ``-9``.
    """
    selected_samples = set(samples)  # O(1) membership inside the loop
    covariate_data_object = []
    for covariate in covariates.split(","):
        covariate_parts = covariate.split(":")
        trait_name = covariate_parts[0]
        dataset_name = covariate_parts[1]
        if dataset_name == "Temp":
            # Temp traits are named "Temp_<species>_<group>_<timestamp>";
            # their dataset has to be created for that group explicitly.
            temp_group = trait_name.split("_")[2]
            dataset_ob = create_dataset(dataset_name="Temp",
                                        dataset_type="Temp",
                                        group_name=temp_group)
        else:
            dataset_ob = create_dataset(dataset_name)
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)

        this_dataset.group.get_samplelist()
        trait_sample_data = trait_ob.data
        this_covariate_data = []
        for sample in this_dataset.group.samplelist:
            if sample not in selected_samples:
                continue
            if sample in trait_sample_data:
                this_covariate_data.append(trait_sample_data[sample].value)
            else:
                this_covariate_data.append("-9")  # missing-value marker
        covariate_data_object.append(this_covariate_data)

    with open((f"{flat_files('mapping')}/"
               f"{this_dataset.group.name}_covariates.txt"),
              "w") as outfile:
        for row_index in range(len(covariate_data_object[0])):
            cells = [
                str(column[row_index]) + "\t"
                for column in covariate_data_object
            ]
            outfile.write("".join(cells) + "\n")
def get_trait_db_obs(self, trait_db_list):
    """Fill ``self.trait_list`` with ``(trait, dataset)`` pairs.

    Every entry of ``trait_db_list`` except the last one is split into
    ``"<trait name>:<dataset name>"`` and turned into objects; the final
    entry is deliberately never processed.
    """
    self.trait_list = []
    # The last element is always skipped, exactly as before.
    for trait_db in trait_db_list[:-1]:
        trait_name, dataset_name = trait_db.split(":")
        dataset_ob = data_set.create_dataset(dataset_name)
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)
        self.trait_list.append((trait_ob, dataset_ob))
def do_correlation(start_vars):
    """Run a correlation for the API and return the top results as dicts.

    ``start_vars`` must contain ``db``, ``target_db`` and ``trait_id``.
    At most ``corr_params['return_count']`` results are returned, in the
    key order of the computed results. The fields of each dict depend on
    ``corr_params['type']`` ("tissue", "literature"/"lit", or anything
    else, which is treated as a sample correlation).
    """
    for required_key in ('db', 'target_db', 'trait_id'):
        assert (required_key in start_vars)

    this_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])
    this_trait = retrieve_sample_data(
        create_trait(dataset=this_dataset, name=start_vars['trait_id']),
        this_dataset)

    corr_params = init_corr_params(start_vars)
    corr_results = calculate_results(this_trait, this_dataset,
                                     target_dataset, corr_params)

    top_traits = list(corr_results.keys())[:corr_params['return_count']]
    final_results = []
    for trait in top_traits:
        corr_type = corr_params['type']
        if corr_type == "tissue":
            sample_r, num_overlap, sample_p, symbol = corr_results[trait]
            entry = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol
            }
        elif corr_type in ("literature", "lit"):
            gene_id, sample_r = corr_results[trait]
            entry = {
                "trait": trait,
                "sample_r": sample_r,
                "gene_id": gene_id
            }
        else:
            sample_r, sample_p, num_overlap = corr_results[trait]
            entry = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p
            }
        final_results.append(entry)

    return final_results
def create_target_this_trait(start_vars):
    """Create the trait, its dataset and the target dataset for correlation.

    Returns:
        ``(this_dataset, this_trait, target_dataset, sample_data)`` where
        ``sample_data`` is always an empty tuple.
    """
    source_name = start_vars['dataset']
    if source_name == "Temp":
        # Temp datasets additionally need the group they belong to.
        dataset_kwargs = dict(dataset_name="Temp",
                              dataset_type="Temp",
                              group_name=start_vars['group'])
    else:
        dataset_kwargs = dict(dataset_name=source_name)
    this_dataset = data_set.create_dataset(**dataset_kwargs)

    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['corr_dataset'])
    this_trait = create_trait(dataset=this_dataset,
                              name=start_vars['trait_id'])

    return (this_dataset, this_trait, target_dataset, ())
def get_export_metadata(trait_id, dataset_name):
    """Build the metadata rows placed at the top of a trait export.

    Returns a list of single-cell rows (each row is a one-element list)
    followed by one empty row. "Publish" datasets get phenotype and
    publication details; every other dataset type gets record, optional
    symbol, dataset and group rows.
    """
    dataset = data_set.create_dataset(dataset_name)
    this_trait = create_trait(dataset=dataset,
                              name=trait_id,
                              cellid=None,
                              get_qtl_info=False)

    if dataset.type == "Publish":
        metadata = [
            ["Phenotype ID: " + trait_id],
            ["Phenotype URL: "
             + "http://genenetwork.org/show_trait?trait_id=" + trait_id
             + "&dataset=" + dataset_name],
            ["Group: " + dataset.group.name],
            # Commas are wrapped in quotes for the exported file.
            ["Phenotype: "
             + this_trait.description_display.replace(",", "\",\"")],
            ["Authors: " + (this_trait.authors or "N/A")],
            ["Title: " + (this_trait.title or "N/A")],
            ["Journal: " + (this_trait.journal or "N/A")],
            ["Dataset Link: http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName="
             + dataset.name],
        ]
    else:
        metadata = [
            ["Record ID: " + trait_id],
            ["Trait URL: "
             + "http://genenetwork.org/show_trait?trait_id=" + trait_id
             + "&dataset=" + dataset_name],
        ]
        if this_trait.symbol:
            metadata.append(["Symbol: " + this_trait.symbol])
        metadata += [
            ["Dataset: " + dataset.name],
            ["Group: " + dataset.group.name],
        ]

    # Blank separator row between the metadata and the data.
    metadata.append([])
    return metadata
def set_template_vars(start_vars, correlation_data):
    """Add the values the correlation results template needs.

    ``correlation_data`` is updated in place (and also returned): the
    trait and both datasets are replaced by their dict/jsonable forms and
    the table JSON, filter columns, header fields and formatted correlation
    type are added.
    """
    corr_type = start_vars['corr_type']
    corr_method = start_vars['corr_sample_method']

    if start_vars['dataset'] == "Temp":
        dataset_kwargs = dict(dataset_name="Temp",
                              dataset_type="Temp",
                              group_name=start_vars['group'])
    else:
        dataset_kwargs = dict(dataset_name=start_vars['dataset'])
    this_dataset_ob = create_dataset(**dataset_kwargs)
    this_trait = create_trait(dataset=this_dataset_ob,
                              name=start_vars['trait_id'])

    correlation_data['this_trait'] = jsonable(this_trait, this_dataset_ob)
    correlation_data['this_dataset'] = this_dataset_ob.as_dict()

    # 'target_dataset' arrives as a name and leaves as a dict.
    target_dataset_ob = create_dataset(correlation_data['target_dataset'])
    correlation_data['target_dataset'] = target_dataset_ob.as_dict()

    correlation_data['table_json'] = correlation_json_for_table(
        correlation_data,
        correlation_data['this_trait'],
        correlation_data['this_dataset'],
        target_dataset_ob)

    target_type = target_dataset_ob.type
    filter_cols_by_type = {"ProbeSet": [7, 6], "Publish": [6, 0]}

    correlation_data['corr_method'] = corr_method
    correlation_data['filter_cols'] = filter_cols_by_type.get(
        target_type, [4, 0])
    correlation_data['header_fields'] = get_header_fields(
        target_type, correlation_data['corr_method'])
    correlation_data['formatted_corr_type'] = get_formatted_corr_type(
        corr_type, corr_method)

    return correlation_data
def gen_covariates_file(this_dataset, covariates, samples):
    """Write covariate values to a uniquely named flat file.

    ``covariates`` is a comma-separated list of
    ``"<trait name>:<dataset name>"`` items. For every sample of
    ``this_dataset``'s group that is also in ``samples`` one row is written,
    with one tab-terminated column per covariate (``-9`` when the covariate
    has no value for the sample).

    Returns:
        The file name (without directory and ``.txt`` suffix), which is
        "COVAR_" plus a hash of the dataset name and the covariate data.
    """
    covariate_data_object = []
    for covariate in covariates.split(","):
        covariate_parts = covariate.split(":")
        trait_name = covariate_parts[0]
        dataset_name = covariate_parts[1]
        if dataset_name == "Temp":
            # The group is the third "_"-separated field of a Temp name.
            dataset_ob = create_dataset(dataset_name="Temp",
                                        dataset_type="Temp",
                                        group_name=trait_name.split("_")[2])
        else:
            dataset_ob = create_dataset(dataset_name)
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)

        this_dataset.group.get_samplelist()
        values_by_sample = trait_ob.data
        column = []
        for sample in this_dataset.group.samplelist:
            if sample not in samples:
                continue
            if sample in values_by_sample:
                column.append(values_by_sample[sample].value)
            else:
                column.append("-9")
        covariate_data_object.append(column)

    filename = "COVAR_" + generate_hash_of_string(
        this_dataset.name + str(covariate_data_object)).replace("/", "_")

    with open((f"{flat_files('mapping')}/"
               f"{filename}.txt"), "w") as outfile:
        for row_index in range(len(covariate_data_object[0])):
            cells = [
                str(column[row_index]) + "\t"
                for column in covariate_data_object
            ]
            outfile.write("".join(cells) + "\n")

    return filename
def cofactors_to_dict(cofactors: str, dataset_ob, samples) -> Dict:
    """Given a string of cofactors, the trait being mapped's dataset ob,
    and list of samples, return cofactor data as a Dict.

    Args:
        cofactors: Comma-separated ``"<trait name>:<dataset name>"`` items;
            may be empty or ``None``.
        dataset_ob: Dataset of the trait being mapped.
        samples: Sample names, in the order the values should be listed.

    Returns:
        ``{cofactor name: [value, ...]}`` with one entry per sample: the
        value rounded to 3 decimals as a string, or ``"NA"`` when the
        cofactor has no data for that sample. Cofactors that belong to a
        dataset other than ``dataset_ob`` are left out.
    """
    cofactor_dict = {}
    if not cofactors:
        return cofactor_dict

    dataset_ob.group.get_samplelist()
    for cofactor in cofactors.split(","):
        cofactor_name, cofactor_dataset = cofactor.split(":")
        if cofactor_dataset != dataset_ob.name:
            continue
        trait_ob = create_trait(dataset=dataset_ob, name=cofactor_name)
        sample_data = trait_ob.data
        cofactor_dict[cofactor_name] = [
            str(round(float(sample_data[sample].value), 3))
            if sample in sample_data else "NA"
            for sample in samples
        ]
    return cofactor_dict
def __init__(self, params):
    """Prepare the data for a two-trait correlation scatter plot.

    Args:
        params: Mapping with ``dataset_1``, ``dataset_2``, ``trait_1``,
            ``trait_2`` and ``method``. A dataset name containing "Temp"
            is treated as a temporary dataset whose group is the second
            "_"-separated field of the name.

    Sets ``self.data`` (raw values), ``self.rdata`` (ranks),
    ``self.indIDs``, ``self.collections_exist`` and ``self.js_data`` /
    ``self.jsdata`` (the same dict) with both regressions.
    """
    def _load_dataset(dataset_param):
        """Create the (possibly temporary) dataset named by a parameter."""
        if "Temp" in dataset_param:
            return data_set.create_dataset(
                dataset_name="Temp",
                dataset_type="Temp",
                group_name=dataset_param.split("_")[1])
        return data_set.create_dataset(dataset_param)

    self.dataset_1 = _load_dataset(params['dataset_1'])
    self.dataset_2 = _load_dataset(params['dataset_2'])

    self.trait_1 = create_trait(name=params['trait_1'],
                                dataset=self.dataset_1)
    self.trait_2 = create_trait(name=params['trait_2'],
                                dataset=self.dataset_2)
    self.method = params['method']

    # Build the sample list on a copy: extending group.samplelist in place
    # would grow the group's own list on every instantiation.
    group_1 = self.dataset_1.group
    primary_samples = group_1.samplelist
    for extra_samples in (group_1.parlist, group_1.f1list):
        if extra_samples is not None:
            primary_samples = [*primary_samples, *extra_samples]

    self.trait_1 = retrieve_sample_data(self.trait_1, self.dataset_1,
                                        primary_samples)
    self.trait_2 = retrieve_sample_data(self.trait_2, self.dataset_2,
                                        primary_samples)

    samples_1, samples_2, num_overlap = (
        corr_result_helpers.normalize_values_with_samples(
            self.trait_1.data, self.trait_2.data))

    self.indIDs = list(samples_1.keys())
    vals_1 = [samples_1[sample].value for sample in samples_1]
    vals_2 = [samples_2[sample].value for sample in samples_2]
    self.data = [vals_1, vals_2]

    # Regression on the raw values.
    slope, intercept, r_value, p_value, _std_err = stats.linregress(
        vals_1, vals_2)
    # NOTE(review): negative slopes always take the scientific format here;
    # abs(slope) may have been intended - confirm before changing.
    if slope < 0.001:
        slope_string = '%.3E' % slope
    else:
        slope_string = '%.3f' % slope

    # Pad each axis by 10% of its span.
    x_buffer = (max(vals_1) - min(vals_1)) * 0.1
    y_buffer = (max(vals_2) - min(vals_2)) * 0.1
    x_range = [min(vals_1) - x_buffer, max(vals_1) + x_buffer]
    y_range = [min(vals_2) - y_buffer, max(vals_2) + y_buffer]

    intercept_coords = get_intercept_coords(slope, intercept, x_range,
                                            y_range)

    # Regression on the ranks; both axes share one range.
    rx = stats.rankdata(vals_1)
    ry = stats.rankdata(vals_2)
    self.rdata = [rx.tolist(), ry.tolist()]
    srslope, srintercept, srr_value, srp_value, _srstd_err = (
        stats.linregress(rx, ry))
    if srslope < 0.001:
        srslope_string = '%.3E' % srslope
    else:
        srslope_string = '%.3f' % srslope

    rank_buffer = (max(rx) - min(rx)) * 0.1
    sr_range = [min(rx) - rank_buffer, max(rx) + rank_buffer]
    sr_intercept_coords = get_intercept_coords(srslope, srintercept,
                                               sr_range, sr_range)

    # Kept as strings, as before.
    if g.user_session.num_collections > 0:
        self.collections_exist = "True"
    else:
        self.collections_exist = "False"

    self.js_data = dict(
        data=self.data,
        rdata=self.rdata,
        indIDs=self.indIDs,
        trait_1=self.trait_1.dataset.name + ": " + str(self.trait_1.name),
        trait_2=self.trait_2.dataset.name + ": " + str(self.trait_2.name),
        samples_1=samples_1,
        samples_2=samples_2,
        num_overlap=num_overlap,
        vals_1=vals_1,
        vals_2=vals_2,
        x_range=x_range,
        y_range=y_range,
        sr_range=sr_range,
        intercept_coords=intercept_coords,
        sr_intercept_coords=sr_intercept_coords,
        slope=slope,
        slope_string=slope_string,
        intercept=intercept,
        r_value=r_value,
        p_value=p_value,
        srslope=srslope,
        srslope_string=srslope_string,
        srintercept=srintercept,
        srr_value=srr_value,
        srp_value=srp_value
    )
    # Both attribute spellings are kept.
    self.jsdata = self.js_data
def __init__(self, user_id, kw):
    """Gather everything the trait page needs for one trait.

    Args:
        user_id: Passed to ``get_highest_user_access_role`` to determine
            the user's role on the trait's resource.
        kw: Request parameters. Three cases are distinguished:
            * ``trait_id`` given and ``dataset`` is not "Temp": a stored
              trait (loaded by ``helper_functions.get_species_dataset_trait``,
              which presumably sets ``self.dataset`` / ``self.this_trait``
              - confirm);
            * ``group`` given: a new temporary trait built from
              ``species``, ``group`` and the pasted ``trait_paste`` values;
            * otherwise: an existing temporary trait whose values are read
              back from Redis.

    Besides many display attributes this sets ``self.hddn`` (hidden form
    fields) and ``self.js_data`` (values handed to the page's JavaScript).
    """
    if 'trait_id' in kw and kw['dataset'] != "Temp":
        self.temp_trait = False
        self.trait_id = kw['trait_id']
        helper_functions.get_species_dataset_trait(self, kw)
    elif 'group' in kw:
        self.temp_trait = True
        self.trait_id = "Temp_" + kw['species'] + "_" + kw['group'] + \
            "_" + datetime.datetime.now().strftime("%m%d%H%M%S")
        self.temp_species = kw['species']
        self.temp_group = kw['group']
        self.dataset = data_set.create_dataset(dataset_name="Temp",
                                               dataset_type="Temp",
                                               group_name=self.temp_group)

        # Put values in Redis so they can be looked up later if
        # added to a collection
        Redis.set(self.trait_id, kw['trait_paste'], ex=ONE_YEAR)
        self.trait_vals = kw['trait_paste'].split()
        self.this_trait = create_trait(dataset=self.dataset,
                                       name=self.trait_id,
                                       cellid=None)
    else:
        self.temp_trait = True
        self.trait_id = kw['trait_id']
        self.temp_species = self.trait_id.split("_")[1]
        self.temp_group = self.trait_id.split("_")[2]
        self.dataset = data_set.create_dataset(dataset_name="Temp",
                                               dataset_type="Temp",
                                               group_name=self.temp_group)
        self.this_trait = create_trait(dataset=self.dataset,
                                       name=self.trait_id,
                                       cellid=None)
        self.trait_vals = Redis.get(self.trait_id).split()

    self.resource_id = get_resource_id(self.dataset, self.trait_id)
    self.admin_status = get_highest_user_access_role(
        user_id=user_id,
        resource_id=(self.resource_id or ""),
        gn_proxy_url=GN_PROXY_URL)

    def _fetch_probe_rows():
        """Return (sequence, name) rows of the trait's probes, in order."""
        # XZ, 06/03/2009: ProbeSet name is not unique among platforms.
        # We should use ProbeSet Id instead.
        # NOTE(review): the query is assembled with string formatting; the
        # interpolated names come from the trait/dataset objects. Consider
        # a parameterized query once the g.db parameter style is confirmed.
        query = (
            "SELECT Probe.Sequence, Probe.Name "
            "FROM Probe, ProbeSet, ProbeSetFreeze, ProbeSetXRef "
            "WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND "
            "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND "
            "ProbeSetFreeze.Name = '%s' AND ProbeSet.Name = '%s' AND "
            "Probe.ProbeSetId = ProbeSet.Id order by Probe.SerialOrder"
        ) % (self.this_trait.dataset.name, self.this_trait.name)
        return g.db.execute(query).fetchall()

    # ZS: Get verify/rna-seq link URLs
    try:
        blatsequence = self.this_trait.blatseq
        if not blatsequence:
            seqs = _fetch_probe_rows()
            if not seqs:
                raise ValueError
            blatsequence = ''
            for seqt in seqs:
                # Only probes whose name ends in an odd digit are used.
                if int(seqt[1][-1]) % 2 == 1:
                    # str.strip replaces the Python-2-only string.strip,
                    # which raised here and blanked both URLs.
                    blatsequence += seqt[0].strip()

        # --------Hongqiang add this part in order to not only blat
        # ProbeSet, but also blat Probe
        blatsequence = ('%3E' + self.this_trait.name + '%0A'
                        + blatsequence + '%0A')
        for seqt in _fetch_probe_rows():
            if int(seqt[1][-1]) % 2 == 1:
                blatsequence += '%3EProbe_' + \
                    seqt[1].strip() + '%0A' + seqt[0].strip() + '%0A'

        species = self.dataset.group.species
        if species == "rat":
            self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('rat', 'rn6',
                                                           blatsequence)
            self.UTHSC_BLAT_URL = ""
        elif species == "mouse":
            self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('mouse', 'mm10',
                                                           blatsequence)
            self.UTHSC_BLAT_URL = webqtlConfig.UTHSC_BLAT % (
                'mouse', 'mm10', blatsequence)
        elif species == "human":
            self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('human', 'hg38',
                                                           blatsequence)
            self.UTHSC_BLAT_URL = ""
        else:
            self.UCSC_BLAT_URL = ""
            self.UTHSC_BLAT_URL = ""
    except Exception:
        # Best effort: traits without usable sequence data simply get no
        # BLAT links.
        self.UCSC_BLAT_URL = ""
        self.UTHSC_BLAT_URL = ""

    if self.dataset.type == "ProbeSet":
        self.show_probes = "True"

    trait_units = get_trait_units(self.this_trait)
    self.get_external_links()
    self.build_correlation_tools()

    self.ncbi_summary = get_ncbi_summary(self.this_trait)

    # Get nearest marker for composite mapping
    if not self.temp_trait:
        if check_if_attr_exists(
                self.this_trait, 'locus_chr'
        ) and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            self.nearest_marker = get_nearest_marker(
                self.this_trait, self.dataset)
        else:
            self.nearest_marker = ""

    self.make_sample_lists()

    trait_vals_by_group = [
        get_trait_vals(sample_type.sample_list)
        for sample_type in self.sample_groups
    ]

    self.max_digits_by_group = get_max_digits(trait_vals_by_group)

    self.qnorm_vals = quantile_normalize_vals(self.sample_groups,
                                              trait_vals_by_group)
    self.z_scores = get_z_scores(self.sample_groups, trait_vals_by_group)

    self.temp_uuid = uuid.uuid4()

    self.sample_group_types = OrderedDict()
    if len(self.sample_groups) > 1:
        self.sample_group_types[
            'samples_primary'] = self.dataset.group.name
        self.sample_group_types['samples_other'] = "Other"
        self.sample_group_types['samples_all'] = "All"
    else:
        self.sample_group_types[
            'samples_primary'] = self.dataset.group.name
    sample_lists = [group.sample_list for group in self.sample_groups]

    self.categorical_var_list = []
    self.numerical_var_list = []
    if not self.temp_trait:
        # ZS: Only using first samplelist, since I think mapping only
        # uses those samples
        self.categorical_var_list = get_categorical_variables(
            self.this_trait, self.sample_groups[0])
        self.numerical_var_list = get_numerical_variables(
            self.this_trait, self.sample_groups[0])

    # ZS: Get list of chromosomes to select for mapping
    self.chr_list = [["All", -1]]
    for i, this_chr in enumerate(
            self.dataset.species.chromosomes.chromosomes):
        self.chr_list.append([
            self.dataset.species.chromosomes.chromosomes[this_chr].name, i
        ])

    self.genofiles = self.dataset.group.get_genofiles()
    study_samplelist_json = self.dataset.group.get_study_samplelists()
    self.study_samplelists = [
        study["title"] for study in study_samplelist_json
    ]

    # ZS: No need to grab scales from .geno file unless it's using
    # a mapping method that reads .geno files.
    # The previous test ('"QTLReaper" or "R/qtl" in ...') was always true
    # and named an undefined variable. If the group exposes no
    # mapping_names the scales are still read, as before.
    mapping_names = getattr(self.dataset.group, "mapping_names", None)
    reads_geno_files = mapping_names is None or any(
        method in mapping_names for method in ("QTLReaper", "R/qtl"))
    if reads_geno_files:
        if self.genofiles:
            self.scales_in_geno = get_genotype_scales(self.genofiles)
        else:
            self.scales_in_geno = get_genotype_scales(
                self.dataset.group.name + ".geno")
    else:
        self.scales_in_geno = {}

    self.has_num_cases = has_num_cases(self.this_trait)

    # ZS: Needed to know whether to display bar chart + get max
    # sample name length in order to set table column width
    self.num_values = 0
    # ZS: So it knows whether to display the Binary R/qtl mapping
    # method, which doesn't work unless all values are 0 or 1
    self.binary = "true"
    # ZS: Since we don't want to show log2 transform option for
    # situations where it doesn't make sense
    self.negative_vals_exist = "false"
    max_samplename_width = 1
    for group in self.sample_groups:
        for sample in group.sample_list:
            if len(sample.name) > max_samplename_width:
                max_samplename_width = len(sample.name)
            if sample.display_value != "x":
                self.num_values += 1
                # The old test (!= 0 or != 1) was true for every value, so
                # "binary" could never stay "true". The numeric value is
                # compared, as it is for the sign check below.
                if sample.value not in (0, 1):
                    self.binary = "false"
                if sample.value < 0:
                    self.negative_vals_exist = "true"

    # ZS: Check whether any attributes have few enough distinct
    # values to show the "Block samples by group" option
    self.categorical_attr_exists = "false"
    for attribute in self.sample_groups[0].attributes:
        if len(self.sample_groups[0].attributes[attribute].distinct_values
               ) <= 10:
            self.categorical_attr_exists = "true"
            break

    sample_column_width = max_samplename_width * 8

    self.stats_table_width, self.trait_table_width = get_table_widths(
        self.sample_groups, sample_column_width, self.has_num_cases)

    if self.num_values >= 5000:
        self.maf = 0.01
    else:
        self.maf = 0.05

    trait_symbol = None
    short_description = None
    if not self.temp_trait:
        if self.this_trait.symbol:
            trait_symbol = self.this_trait.symbol
            short_description = trait_symbol
        elif hasattr(self.this_trait, 'post_publication_abbreviation'):
            short_description = self.this_trait.post_publication_abbreviation
        elif hasattr(self.this_trait, 'pre_publication_abbreviation'):
            short_description = self.this_trait.pre_publication_abbreviation

    # Todo: Add back in the ones we actually need from below, as we
    # discover we need them
    hddn = OrderedDict()

    if self.dataset.group.allsamples:
        hddn['allsamples'] = ','.join(self.dataset.group.allsamples)
    hddn['primary_samples'] = ','.join(self.primary_sample_names)
    hddn['trait_id'] = self.trait_id
    hddn['trait_display_name'] = self.this_trait.display_name
    hddn['dataset'] = self.dataset.name
    hddn['temp_trait'] = False
    if self.temp_trait:
        hddn['temp_trait'] = True
        hddn['group'] = self.temp_group
        hddn['species'] = self.temp_species
    else:
        hddn['group'] = self.dataset.group.name
        hddn['species'] = self.dataset.group.species
    hddn['use_outliers'] = False
    hddn['method'] = "gemma"
    hddn['selected_chr'] = -1
    hddn['mapping_display_all'] = True
    hddn['suggestive'] = 0
    hddn['study_samplelists'] = json.dumps(study_samplelist_json)
    hddn['num_perm'] = 0
    hddn['categorical_vars'] = ""
    if self.categorical_var_list:
        hddn['categorical_vars'] = ",".join(self.categorical_var_list)
    hddn['manhattan_plot'] = ""
    hddn['control_marker'] = ""
    if not self.temp_trait:
        if hasattr(
                self.this_trait, 'locus_chr'
        ) and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
            hddn['control_marker'] = self.nearest_marker
    hddn['do_control'] = False
    hddn['maf'] = 0.05
    hddn['mapping_scale'] = "physic"
    hddn['compare_traits'] = []
    hddn['export_data'] = ""
    hddn['export_format'] = "excel"
    # With exactly one genotype file entry, default to its first scale.
    if len(self.scales_in_geno) < 2 and bool(self.scales_in_geno):
        hddn['mapping_scale'] = self.scales_in_geno[list(
            self.scales_in_geno.keys())[0]][0][0]

    # We'll need access to this_trait and hddn in the Jinja2
    # Template, so we put it inside self
    self.hddn = hddn

    js_data = dict(trait_id=self.trait_id,
                   trait_symbol=trait_symbol,
                   max_digits=self.max_digits_by_group,
                   short_description=short_description,
                   unit_type=trait_units,
                   dataset_type=self.dataset.type,
                   species=self.dataset.group.species,
                   scales_in_geno=self.scales_in_geno,
                   data_scale=self.dataset.data_scale,
                   sample_group_types=self.sample_group_types,
                   sample_lists=sample_lists,
                   se_exists=self.sample_groups[0].se_exists,
                   has_num_cases=self.has_num_cases,
                   attributes=self.sample_groups[0].attributes,
                   categorical_attr_exists=self.categorical_attr_exists,
                   categorical_vars=",".join(self.categorical_var_list),
                   num_values=self.num_values,
                   qnorm_values=self.qnorm_vals,
                   zscore_values=self.z_scores,
                   sample_column_width=sample_column_width,
                   temp_uuid=self.temp_uuid)
    self.js_data = js_data
def run_analysis(self, requestform):
    """Run a CTL scan in R for the submitted traits and collect the output.

    Args:
        requestform: Form data with ``trait_list`` (comma-separated
            ``"<trait name>:<dataset name>"`` items), ``strategy``,
            ``nperm``, ``parametric`` and ``significance``.

    Sets ``self.trait_db_list``, ``self.results`` (image names/locations,
    the significant interactions and the request form) and
    ``self.elements`` (JSON of the nodes and edges collected via
    ``addNode`` / ``addEdge``).
    """
    logger.info("Starting CTL analysis on dataset")
    stripped_traits = [
        trait.strip() for trait in requestform['trait_list'].split(',')
    ]
    # Empty entries (e.g. from a trailing comma) are dropped here, so every
    # remaining item is a non-empty "<name>:<dataset>" string.
    self.trait_db_list = [x for x in stripped_traits if x]

    # NOTE(review): these multi-argument logger calls assume the project
    # logger accepts extra positional values - confirm.
    logger.debug("strategy:", requestform.get("strategy"))
    strategy = requestform.get("strategy")

    logger.debug("nperm:", requestform.get("nperm"))
    nperm = int(requestform.get("nperm"))

    logger.debug("parametric:", requestform.get("parametric"))
    # NOTE(review): any non-empty string (including "False") is truthy
    # here - confirm what the form actually submits.
    parametric = bool(requestform.get("parametric"))

    logger.debug("significance:", requestform.get("significance"))
    significance = float(requestform.get("significance"))

    # Get the name of the .geno file belonging to the first phenotype
    datasetname = self.trait_db_list[0].split(":")[1]
    dataset = data_set.create_dataset(datasetname)

    genofilelocation = locate(dataset.group.name + ".geno", "genotype")
    parser = genofile_parser.ConvertGenoFile(genofilelocation)
    parser.process_csv()
    logger.debug("dataset group: ", dataset.group)

    # Create a genotype matrix
    individuals = parser.individuals
    markers = []
    markernames = []
    for marker in parser.markers:
        markernames.append(marker["name"])
        markers.append(marker["genotypes"])

    genotypes = list(itertools.chain(*markers))
    logger.debug(
        len(genotypes) / len(individuals), "==", len(parser.markers))

    rGeno = r_t(
        ro.r.matrix(r_unlist(genotypes),
                    nrow=len(markernames),
                    ncol=len(individuals),
                    dimnames=r_list(markernames, individuals),
                    byrow=True))

    # Create a phenotype matrix
    traits = []
    for trait in self.trait_db_list:
        logger.debug("retrieving data for", trait)
        ts = trait.split(':')
        gt = create_trait(name=ts[0], dataset_name=ts[1])
        # Sample data is retrieved against the first trait's dataset.
        gt = retrieve_sample_data(gt, dataset, individuals)
        for ind in individuals:
            # Direct lookup instead of rebuilding a key list per
            # individual.
            if ind in gt.data:
                traits.append(gt.data[ind].value)
            else:
                traits.append("-999")  # missing-value marker

    rPheno = r_t(
        ro.r.matrix(r_as_numeric(r_unlist(traits)),
                    nrow=len(self.trait_db_list),
                    ncol=len(individuals),
                    dimnames=r_list(self.trait_db_list, individuals),
                    byrow=True))

    logger.debug(rPheno)

    # Use a data frame to store the objects
    rPheno = r_data_frame(rPheno, check_names=False)
    rGeno = r_data_frame(rGeno, check_names=False)

    # Perform the CTL scan
    res = self.r_CTLscan(rGeno,
                         rPheno,
                         strategy=strategy,
                         nperm=nperm,
                         parametric=parametric,
                         nthreads=6)

    # Get significant interactions
    significant = self.r_CTLsignificant(res, significance=significance)

    # Create an image for output
    self.results = {}
    self.results['imgurl1'] = webqtlUtil.genRandStr("CTLline_") + ".png"
    self.results['imgloc1'] = GENERATED_IMAGE_DIR + self.results['imgurl1']

    self.results['ctlresult'] = significant
    # Store the user specified parameters for the output page
    self.results['requestform'] = requestform

    # Create the lineplot
    r_png(self.results['imgloc1'], width=1000, height=600, type='cairo-png')
    self.r_lineplot(res, significance=significance)
    r_dev_off()

    # Image 1 is the line plot, so the per-phenotype plots start at 2;
    # R's phenotype index is one lower (R starts from 1).
    for n, trait in enumerate(self.trait_db_list, start=2):
        # Create the QTL like CTL plots
        url_key = 'imgurl' + str(n)
        loc_key = 'imgloc' + str(n)
        self.results[url_key] = webqtlUtil.genRandStr("CTL_") + ".png"
        self.results[loc_key] = GENERATED_IMAGE_DIR + self.results[url_key]
        r_png(self.results[loc_key],
              width=1000,
              height=600,
              type='cairo-png')
        self.r_plotCTLobject(res, (n - 1),
                             significance=significance,
                             main='Phenotype ' + trait)
        r_dev_off()

    # Flush any output from R
    sys.stdout.flush()

    # Create the interactive graph for cytoscape visualization
    # (Nodes and Edges)
    if not isinstance(significant, ri.RNULLType):
        for x in range(len(significant[0])):
            # Debug to console
            logger.debug(significant[0][x], significant[1][x],
                         significant[2][x])
            tsS = significant[0][x].split(':')  # Source
            tsT = significant[2][x].split(':')  # Target
            # Retrieve Source and Target info from the DB
            gtS = create_trait(name=tsS[0], dataset_name=tsS[1])
            gtT = create_trait(name=tsT[0], dataset_name=tsT[1])
            self.addNode(gtS)
            self.addNode(gtT)
            self.addEdge(gtS, gtT, significant, x)

            # Update the trait names for the displayed table
            significant[0][x] = "{} ({})".format(gtS.symbol, gtS.name)
            significant[2][x] = "{} ({})".format(gtT.symbol, gtT.name)

    self.elements = json.dumps(self.nodes_list + self.edges_list)
def do_mapping_for_api(start_vars):
    """Run a GEMMA or R/qtl mapping for the API.

    ``start_vars`` must contain ``db`` and ``trait_id``.

    Returns:
        ``(rows, "csv")`` with a header row followed by one row per marker,
        ``(result_markers, "json")``, or ``(result_markers, None)`` for any
        other requested format.
    """
    for required_key in ('db', 'trait_id'):
        assert (required_key in start_vars)

    dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    dataset.group.get_markers()
    this_trait = create_trait(dataset=dataset, name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, dataset)

    # vals gets one entry per sample of the group ("x" when the trait has
    # no data for it); samples only lists the keys that did have data.
    not_found = object()
    samples = []
    vals = []
    for sample in dataset.group.samplelist:
        matched_key = next((key for key in this_trait.data
                            if this_trait.data[key].name == sample),
                           not_found)
        if matched_key is not_found:
            vals.append("x")
            continue
        value = str(this_trait.data[matched_key].value)
        samples.append(matched_key)
        vals.append(value)

    mapping_params = initialize_parameters(start_vars, dataset, this_trait)

    # ZS: It seems to take an empty string as default. This should
    # probably be changed.
    covariates = ""

    if mapping_params['mapping_method'] == "gemma":
        header_row = ["name", "chr", "Mb", "lod_score", "p_value"]
        loco_enabled = mapping_params['use_loco'] == "True"
        gemma_output = gemma_mapping.run_gemma(this_trait, dataset, samples,
                                               vals, covariates,
                                               mapping_params['use_loco'],
                                               mapping_params['maf'])
        # ZS: gemma_mapping returns both results and the filename for
        # LOCO, so need to only grab the former for api
        result_markers = gemma_output[0] if loco_enabled else gemma_output
    elif mapping_params['mapping_method'] == "rqtl":
        header_row = ["name", "chr", "cM", "lod_score"]
        with_permutations = mapping_params['num_perm'] > 0
        rqtl_output = rqtl_mapping.run_rqtl_geno(
            vals, dataset, mapping_params['rqtl_method'],
            mapping_params['rqtl_model'], mapping_params['perm_check'],
            mapping_params['num_perm'], mapping_params['do_control'],
            mapping_params['control_marker'],
            mapping_params['manhattan_plot'], mapping_params['pair_scan'])
        if with_permutations:
            # With permutations the markers are the last of four values.
            (_sperm_output, _suggestive, _significant,
             result_markers) = rqtl_output
        else:
            result_markers = rqtl_output

    if mapping_params['limit_to']:
        result_markers = result_markers[:mapping_params['limit_to']]

    if mapping_params['format'] == "csv":
        output_rows = [header_row]
        for marker in result_markers:
            output_rows.append([marker[header] for header in header_row])
        return output_rows, mapping_params['format']
    if mapping_params['format'] == "json":
        return result_markers, mapping_params['format']
    return result_markers, None
def add_cofactors(cross, this_dataset, covariates, samples):
    """Add the covariate traits to an R/qtl cross object.

    Args:
        cross: The cross object to extend.
        this_dataset: Dataset of the trait being mapped; only samples of
            its group are used.
        covariates: Comma-separated ``"<trait name>:<dataset name>"`` items.
        samples: Sample names, in the order the values are passed to R.

    Returns:
        ``(cross, covars_ob)``: the extended cross and the object returned
        by ``pull_var`` for the added covariate columns.
    """
    ro.numpy2ri.activate()
    covar_col_names = []
    for i, covariate in enumerate(covariates.split(",")):
        logger.info("Covariate: " + covariate)
        covariate_parts = covariate.split(":")
        trait_name = covariate_parts[0]
        dataset_name = covariate_parts[1]
        if dataset_name == "Temp":
            # Temp traits are named "Temp_<species>_<group>_<timestamp>";
            # their dataset has to be created for that group explicitly.
            dataset_ob = create_dataset(dataset_name="Temp",
                                        dataset_type="Temp",
                                        group_name=trait_name.split("_")[2])
        else:
            dataset_ob = create_dataset(dataset_name)
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)

        this_dataset.group.get_samplelist()
        trait_samples = this_dataset.group.samplelist
        trait_sample_data = trait_ob.data
        this_covar_data = []
        for sample in samples:
            if sample not in trait_samples:
                continue
            if sample in trait_sample_data:
                this_covar_data.append(trait_sample_data[sample].value)
            else:
                this_covar_data.append("NA")

        # R vector literal, e.g. c(1.0,NA,2.5)
        covar_as_string = "c(" + ",".join(
            str(item) for item in this_covar_data) + ")"

        datatype = get_trait_data_type(covariate)
        logger.info("Covariate: " + covariate + " is of type: " + datatype)
        if datatype == "categorical":  # Cat variable
            logger.info("call of add_categorical_covar")
            # Expand and add it to the cross
            cross, col_names = add_categorical_covar(
                cross, covar_as_string, i)
            logger.info("add_categorical_covar returned")
            covar_col_names.extend(col_names)
        else:
            col_name = "covar_" + str(i)
            cross = add_phenotype(cross, covar_as_string, col_name)
            covar_col_names.append(col_name)

    # Joining avoids the dangling ", " the hand-rolled separators left when
    # the final categorical covariate expanded to no columns.
    covar_name_string = "c(" + ", ".join(
        '"' + col_name + '"' for col_name in covar_col_names) + ")"
    covars_ob = pull_var("trait_covars", cross, covar_name_string)
    return cross, covars_ob
def export_search_results_csv(targs):
    """Build one CSV export per group for the traits of a search-result table.

    Args:
        targs: Mapping of request arguments. ``export_data`` (required) is a
            JSON string whose ``rows`` entry lists
            ``"trait_name:dataset_name:hash"`` strings. ``database_name``,
            ``accession_id``, ``search_string`` and ``filter_term`` are
            optional; each is written to the metadata header unless it is
            missing or equal to the string ``"None"``.

    Returns:
        A list of ``[file_name, csv_text]`` pairs, one per key returned by
        ``sort_traits_by_group``. Each CSV starts with the metadata rows,
        followed by the trait table written transposed (one column per trait,
        one row per field / sample).
    """
    table_data = json.loads(targs['export_data'])
    table_rows = table_data['rows']

    # A single timestamp keeps "Export Date" and "Export Time" consistent even
    # if the export straddles a minute or day boundary.
    now = datetime.datetime.now()

    metadata = []

    def add_optional(key, label):
        # A missing key and the literal string "None" both mean "not provided"
        if key in targs and targs[key] != "None":
            metadata.append([label + targs[key]])

    add_optional('database_name', "Data Set: ")
    add_optional(
        'accession_id',
        "Metadata Link: http://genenetwork.org/webqtl/main.py?FormID=sharinginfo&GN_AccessionId=")
    metadata.append(["Export Date: " + now.strftime("%B %d, %Y")])
    # NOTE(review): the label says GMT but the timestamp is the server's local
    # time — confirm which one is intended before changing either.
    metadata.append(["Export Time: " + now.strftime("%H:%M GMT")])
    add_optional('search_string', "Search Query: ")
    add_optional('filter_term', "Search Filter Terms: ")
    metadata.append(["Exported Row Number: " + str(len(table_rows))])
    metadata.append([
        "Funding for The GeneNetwork: NIGMS (R01 GM123489, 2017-2021), NIDA (P30 DA044223, 2017-2022), NIA (R01AG043930, 2013-2018), NIAAA (U01 AA016662, U01 AA013499, U24 AA013513, U01 AA014425, 2006-2017), NIDA/NIMH/NIAAA (P20-DA 21131, 2001-2012), NCI MMHCC (U01CA105417), NCRR/BIRN (U24 RR021760)"
    ])
    metadata.append([])

    trait_list = []
    for trait in table_rows:
        trait_name, dataset_name, _hash = trait.split(":")
        trait_ob = create_trait(name=trait_name, dataset_name=dataset_name)
        trait_ob = retrieve_trait_info(trait_ob, trait_ob.dataset, get_qtl_info=True)
        trait_list.append(trait_ob)

    table_headers = [
        'Index', 'URL', 'Species', 'Group', 'Dataset', 'Record ID', 'Symbol',
        'Description', 'ProbeTarget', 'PubMed_ID', 'Chr', 'Mb', 'Alias',
        'Gene_ID', 'Homologene_ID', 'UniGene_ID', 'Strand_Probe',
        'Probe_set_specificity', 'Probe_set_BLAT_score',
        'Probe_set_BLAT_Mb_start', 'Probe_set_BLAT_Mb_end', 'QTL_Chr',
        'QTL_Mb', 'Locus_at_Peak', 'Max_LRS', 'P_value_of_MAX',
        'Mean_Expression'
    ]

    traits_by_group = sort_traits_by_group(trait_list)

    file_list = []
    for group in list(traits_by_group.keys()):
        group_traits = traits_by_group[group]

        # Every sample gets a value column and a "<sample>_SE" column; the
        # header is taken from the first trait's group.
        sample_headers = []
        for sample in group_traits[0].dataset.group.samplelist:
            sample_headers += [sample, sample + "_SE"]

        csv_rows = [table_headers + sample_headers]

        for index, trait in enumerate(group_traits, start=1):
            # Prefer the symbol, fall back to the abbreviation
            trait_symbol = (getattr(trait, "symbol", None)
                            or getattr(trait, "abbreviation", None)
                            or "N/A")
            row_contents = [
                index,
                "https://genenetwork.org/show_trait?trait_id=" + str(trait.name) + "&dataset=" + str(trait.dataset.name),
                trait.dataset.group.species,
                trait.dataset.group.name,
                trait.dataset.name,
                trait.name,
                trait_symbol,
                getattr(trait, "description_display", "N/A"),
                getattr(trait, "probe_target_description", "N/A"),
                getattr(trait, "pubmed_id", "N/A"),
                getattr(trait, "chr", "N/A"),
                getattr(trait, "mb", "N/A"),
                trait.alias_fmt,
                getattr(trait, "geneid", "N/A"),
                getattr(trait, "homologeneid", "N/A"),
                getattr(trait, "unigeneid", "N/A"),
                getattr(trait, "strand_probe", "N/A"),
                getattr(trait, "probe_set_specificity", "N/A"),
                getattr(trait, "probe_set_blat_score", "N/A"),
                getattr(trait, "probe_set_blat_mb_start", "N/A"),
                getattr(trait, "probe_set_blat_mb_end", "N/A"),
                getattr(trait, "locus_chr", "N/A"),
                getattr(trait, "locus_mb", "N/A"),
                getattr(trait, "locus", "N/A"),
                getattr(trait, "lrs", "N/A"),
                getattr(trait, "pvalue", "N/A"),
                getattr(trait, "mean", "N/A")
            ]

            for sample in trait.dataset.group.samplelist:
                if sample in trait.data:
                    row_contents += [trait.data[sample].value,
                                     trait.data[sample].variance]
                else:
                    row_contents += ["x", "x"]

            csv_rows.append(row_contents)

        # Transpose so each trait becomes a column; zip_longest pads rows of
        # unequal length with None (written as empty cells).
        transposed_rows = list(map(list, itertools.zip_longest(*csv_rows)))

        with io.StringIO() as buff:
            writer = csv.writer(buff)
            writer.writerows(metadata)
            writer.writerows(transposed_rows)
            csv_data = buff.getvalue()

        file_name = group + "_traits.csv"
        file_list.append([file_name, csv_data])

    return file_list
def __init__(self, start_vars):
    """Correlate one trait against every trait of a target dataset.

    Args:
        start_vars: Mapping of request parameters. Must contain ``corr_type``
            (the branches below handle ``"sample"``, ``"tissue"`` and
            ``"lit"``), ``dataset``, ``corr_sample_method``,
            ``corr_samples_group``, ``corr_dataset`` and
            ``corr_return_results``. When ``dataset`` is ``"Temp"``, ``group``
            and ``trait_id`` are read as well. ``loc_chr`` is optional but
            requires ``min_loc_mb`` and ``max_loc_mb``. ``min_expr``,
            ``p_range_lower``, ``p_range_upper`` and ``location_type`` are
            read through the ``get_float`` / ``get_string`` helpers.

    Raises:
        AssertionError: If a required key is missing (only when Python runs
            without ``-O``).

    Side effects:
        Sets, among other attributes, ``self.correlation_results`` (the trait
        objects that passed every filter) and ``self.json_results``.
    """
    # get trait list from db (database name)
    # calculate correlation with Base vector and targets

    # Check parameters
    assert('corr_type' in start_vars)
    assert(is_str(start_vars['corr_type']))
    assert('dataset' in start_vars)
    # assert('group' in start_vars) permitted to be empty?
    assert('corr_sample_method' in start_vars)
    assert('corr_samples_group' in start_vars)
    assert('corr_dataset' in start_vars)
    assert('corr_return_results' in start_vars)
    if 'loc_chr' in start_vars:
        assert('min_loc_mb' in start_vars)
        assert('max_loc_mb' in start_vars)

    with Bench("Doing correlations"):
        if start_vars['dataset'] == "Temp":
            self.dataset = data_set.create_dataset(dataset_name = "Temp", dataset_type = "Temp", group_name = start_vars['group'])
            self.trait_id = start_vars['trait_id']
            self.this_trait = create_trait(dataset=self.dataset,
                                           name=self.trait_id,
                                           cellid=None)
        else:
            helper_functions.get_species_dataset_trait(self, start_vars)

        corr_samples_group = start_vars['corr_samples_group']

        self.sample_data = {}
        self.corr_type = start_vars['corr_type']
        self.corr_method = start_vars['corr_sample_method']
        self.min_expr = get_float(start_vars, 'min_expr')
        self.p_range_lower = get_float(start_vars, 'p_range_lower', -1.0)
        self.p_range_upper = get_float(start_vars, 'p_range_upper', 1.0)

        if ('loc_chr' in start_vars and
                'min_loc_mb' in start_vars and
                'max_loc_mb' in start_vars):
            self.location_type = get_string(start_vars, 'location_type')
            self.location_chr = get_string(start_vars, 'loc_chr')
            self.min_location_mb = get_int(start_vars, 'min_loc_mb')
            self.max_location_mb = get_int(start_vars, 'max_loc_mb')
        else:
            self.location_type = self.location_chr = self.min_location_mb = self.max_location_mb = None

        self.get_formatted_corr_type()
        self.return_number = int(start_vars['corr_return_results'])

        # The two if statements below append samples to the sample list based
        # upon whether the user selected Primary Samples Only, Other Samples
        # Only, or All Samples.
        #
        # Work on a copy: extending group.samplelist in place would add the
        # parents/F1s to the group's own sample list (and again on every later
        # use of the same group object).
        group = self.dataset.group
        parents_and_f1 = []
        if group.parlist is not None:
            parents_and_f1 += group.parlist
        if group.f1list is not None:
            parents_and_f1 += group.f1list
        primary_samples = list(group.samplelist) + parents_and_f1

        # If either BXD/whatever Only or All Samples, append all of that
        # group's samplelist
        if corr_samples_group != 'samples_other':
            self.process_samples(start_vars, primary_samples)

        # If either Non-BXD/whatever or All Samples, get all samples from
        # this_trait.data and exclude the primary samples (because they would
        # have been added in the previous if statement if the user selected
        # All Samples)
        if corr_samples_group != 'samples_primary':
            if corr_samples_group == 'samples_other':
                # Parents and F1s count as "other" samples here; a missing
                # parlist/f1list is treated as empty rather than raising.
                primary_samples = [x for x in primary_samples
                                   if x not in parents_and_f1]
            self.process_samples(start_vars, list(self.this_trait.data.keys()), primary_samples)

        self.target_dataset = data_set.create_dataset(start_vars['corr_dataset'])
        self.target_dataset.get_trait_data(list(self.sample_data.keys()))

        self.header_fields = get_header_fields(self.target_dataset.type, self.corr_method)

        if self.target_dataset.type == "ProbeSet":
            self.filter_cols = [7, 6]
        elif self.target_dataset.type == "Publish":
            self.filter_cols = [6, 0]
        else:
            self.filter_cols = [4, 0]

        self.correlation_results = []

        self.correlation_data = {}

        if self.corr_type == "tissue":
            self.trait_symbol_dict = self.dataset.retrieve_genes("Symbol")

            tissue_corr_data = self.do_tissue_correlation_for_all_traits()
            if tissue_corr_data is not None:
                for trait in list(tissue_corr_data.keys())[:self.return_number]:
                    self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait])
            else:
                # No tissue data: fall back to correlating every target trait
                for trait, values in list(self.target_dataset.trait_data.items()):
                    self.get_sample_r_and_p_values(trait, values)

        elif self.corr_type == "lit":
            self.trait_geneid_dict = self.dataset.retrieve_genes("GeneId")
            lit_corr_data = self.do_lit_correlation_for_all_traits()

            for trait in list(lit_corr_data.keys())[:self.return_number]:
                self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait])

        elif self.corr_type == "sample":
            for trait, values in list(self.target_dataset.trait_data.items()):
                self.get_sample_r_and_p_values(trait, values)

        # Strongest correlations (by absolute value of the first entry) first
        self.correlation_data = collections.OrderedDict(sorted(list(self.correlation_data.items()),
                                                               key=lambda t: -abs(t[1][0])))

        # ZS: Convert min/max chromosome to an int for the location range option
        range_chr_as_int = None
        for order_id, chr_info in list(self.dataset.species.chromosomes.chromosomes.items()):
            if 'loc_chr' in start_vars:
                if chr_info.name == self.location_chr:
                    range_chr_as_int = order_id

        for trait in list(self.correlation_data.keys())[:self.return_number]:
            trait_object = create_trait(dataset=self.target_dataset, name=trait, get_qtl_info=True, get_sample_info=False)
            if not trait_object:
                continue

            # Chromosome order id of either the peak-LOD locus or the trait's
            # own position, depending on the requested location type
            chr_as_int = 0
            for order_id, chr_info in list(self.dataset.species.chromosomes.chromosomes.items()):
                if self.location_type == "highest_lod":
                    if chr_info.name == trait_object.locus_chr:
                        chr_as_int = order_id
                else:
                    if chr_info.name == trait_object.chr:
                        chr_as_int = order_id

            if (float(self.correlation_data[trait][0]) >= self.p_range_lower and
                    float(self.correlation_data[trait][0]) <= self.p_range_upper):

                if (self.target_dataset.type == "ProbeSet" or self.target_dataset.type == "Publish") and bool(trait_object.mean):
                    if (self.min_expr is not None) and (float(trait_object.mean) < self.min_expr):
                        continue

                if range_chr_as_int is not None and (chr_as_int != range_chr_as_int):
                    continue
                if self.location_type == "highest_lod":
                    if (self.min_location_mb is not None) and (float(trait_object.locus_mb) < float(self.min_location_mb)):
                        continue
                    if (self.max_location_mb is not None) and (float(trait_object.locus_mb) > float(self.max_location_mb)):
                        continue
                else:
                    if (self.min_location_mb is not None) and (float(trait_object.mb) < float(self.min_location_mb)):
                        continue
                    if (self.max_location_mb is not None) and (float(trait_object.mb) > float(self.max_location_mb)):
                        continue

                (trait_object.sample_r,
                 trait_object.sample_p,
                 trait_object.num_overlap) = self.correlation_data[trait]

                # Set some sane defaults
                trait_object.tissue_corr = 0
                trait_object.tissue_pvalue = 0
                trait_object.lit_corr = 0
                if self.corr_type == "tissue" and tissue_corr_data is not None:
                    trait_object.tissue_corr = tissue_corr_data[trait][1]
                    trait_object.tissue_pvalue = tissue_corr_data[trait][2]
                elif self.corr_type == "lit":
                    trait_object.lit_corr = lit_corr_data[trait][1]

                self.correlation_results.append(trait_object)

        # Fill in the secondary correlation columns for the traits that were
        # kept, when both datasets are ProbeSet datasets
        if self.corr_type != "lit" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
            self.do_lit_correlation_for_trait_list()

        if self.corr_type != "tissue" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
            self.do_tissue_correlation_for_trait_list()

    self.json_results = generate_corr_json(self.correlation_results, self.this_trait, self.dataset, self.target_dataset)
def _format_3dp_or_na(value):
    """Return *value* formatted to three decimals, or "N/A" when it is falsy."""
    return f"{float(value):.3f}" if value else "N/A"


def _lod_score_or_na(target_trait):
    """Return the trait's LRS score converted to LOD (LRS / 4.61) or "N/A"."""
    try:
        return f"{float(target_trait['lrs_score']) / 4.61:.1f}"
    except (KeyError, TypeError, ValueError):
        # Missing, None or non-numeric LRS score
        return "N/A"


def correlation_json_for_table(correlation_data, this_trait, this_dataset, target_dataset_ob):
    """Return JSON data for use with the DataTable in the correlation result page

    Trait metadata is cached per target dataset in
    ``TMPDIR/<dataset name>_metadata.json``; traits missing from the cache are
    created with ``create_trait`` and appended to the cache file.

    Keyword arguments:
    correlation_data -- Correlation results; its 'this_dataset' and
                        'correlation_results' entries are read here
    this_trait -- Unused; kept for interface compatibility
    this_dataset -- Ignored; the value from correlation_data['this_dataset']
                    is used instead
    target_dataset_ob - Target dataset, as a Dataset ob

    Returns a JSON string holding one dict per viewable result row.
    """
    this_dataset = correlation_data['this_dataset']
    target_dataset = target_dataset_ob.as_dict()

    corr_results = correlation_data['correlation_results']
    results_list = []

    file_name = f"{target_dataset['name']}_metadata.json"
    file_path = os.path.join(TMPDIR, file_name)
    new_traits_metadata = {}

    try:
        with open(file_path, "r+") as file_handler:
            dataset_metadata = json.load(file_handler)
    except FileNotFoundError:
        Path(file_path).touch(exist_ok=True)
        dataset_metadata = {}
    except json.JSONDecodeError:
        # The touch() above leaves an empty file behind when no trait gets
        # cached; treat an empty/corrupt cache as "nothing cached" instead of
        # failing every later request for this dataset.
        dataset_metadata = {}

    for i, trait_dict in enumerate(corr_results):
        # Each result is a single-entry dict: {trait_name: correlation values}
        trait_name = list(trait_dict.keys())[0]
        trait = trait_dict[trait_name]

        target_trait = dataset_metadata.get(trait_name)
        if target_trait is None:
            target_trait_ob = create_trait(dataset=target_dataset_ob,
                                           name=trait_name,
                                           get_qtl_info=True)
            target_trait = jsonable(target_trait_ob, target_dataset_ob)
            new_traits_metadata[trait_name] = target_trait

        if target_trait['view'] == False:
            continue

        results_dict = {}
        # The index keeps the position in corr_results, so skipped traits
        # leave gaps
        results_dict['index'] = i + 1
        results_dict['trait_id'] = target_trait['name']
        results_dict['dataset'] = target_dataset['name']
        results_dict['hmac'] = hmac.data_hmac('{}:{}'.format(
            target_trait['name'], target_dataset['name']))
        results_dict['sample_r'] = f"{float(trait['corr_coefficient']):.3f}"
        results_dict['num_overlap'] = trait['num_overlap']
        results_dict['sample_p'] = f"{float(trait['p_value']):.3e}"

        if target_dataset['type'] == "ProbeSet":
            results_dict['symbol'] = target_trait['symbol']
            results_dict['description'] = target_trait['description'] or "N/A"
            results_dict['location'] = target_trait['location']
            results_dict['mean'] = _format_3dp_or_na(target_trait['mean'])
            results_dict['additive'] = _format_3dp_or_na(target_trait['additive'])
            results_dict['lod_score'] = _lod_score_or_na(target_trait)
            results_dict['lrs_location'] = target_trait['lrs_location']
            results_dict['lit_corr'] = "--"
            results_dict['tissue_corr'] = "--"
            results_dict['tissue_pvalue'] = "--"
            if this_dataset['type'] == "ProbeSet":
                if 'lit_corr' in trait:
                    results_dict['lit_corr'] = f"{float(trait['lit_corr']):.3f}"
                if 'tissue_corr' in trait:
                    results_dict['tissue_corr'] = f"{float(trait['tissue_corr']):.3f}"
                    results_dict['tissue_pvalue'] = f"{float(trait['tissue_p_val']):.3e}"
        elif target_dataset['type'] == "Publish":
            results_dict['abbreviation_display'] = target_trait['abbreviation'] or "N/A"
            results_dict['description'] = target_trait['description'] or "N/A"
            results_dict['mean'] = _format_3dp_or_na(target_trait['mean'])

            results_dict['authors_display'] = "N/A"
            if target_trait['authors']:
                authors_list = target_trait['authors'].split(',')
                if len(authors_list) > 6:
                    # Long author lists are cut after the sixth name
                    results_dict['authors_display'] = ", ".join(
                        authors_list[:6]) + ", et al."
                else:
                    results_dict['authors_display'] = target_trait['authors']

            results_dict['additive'] = _format_3dp_or_na(target_trait['additive'])

            results_dict['pubmed_link'] = "N/A"
            results_dict['pubmed_text'] = "N/A"
            if 'pubmed_id' in target_trait:
                results_dict['pubmed_link'] = target_trait['pubmed_link']
                results_dict['pubmed_text'] = target_trait['pubmed_text']

            results_dict['lod_score'] = _lod_score_or_na(target_trait)
            results_dict['lrs_location'] = target_trait['lrs_location']
        else:
            results_dict['location'] = target_trait['location']

        results_list.append(results_dict)

    if new_traits_metadata:
        # New traits were created above: persist them in the cache file
        dataset_metadata.update(new_traits_metadata)
        with open(file_path, "w+") as file_handler:
            json.dump(dataset_metadata, file_handler)

    return json.dumps(results_list)
def __init__(self, kw):
    """Run a global search for genes or phenotypes.

    Args:
        kw: Mapping with ``type`` (``"gene"`` or ``"phenotype"``) and
            ``terms`` (the user's search string). For phenotype searches a
            term of the form ``XXX_rest`` with a three-character prefix is
            split into an ``InbredSetCode`` filter and the remaining term.

    Side effects:
        For both supported types sets ``self.trait_count``,
        ``self.trait_list`` (a JSON string, at most 6000 rows are queried) and
        ``self.header_fields``. Any other type leaves them unset.
    """
    assert('type' in kw)
    assert('terms' in kw)

    self.type = kw['type']
    self.terms = kw['terms']
    assert(is_str(self.type))

    if self.type == "gene":
        # The search terms are bound as a query parameter instead of being
        # interpolated into the SQL text (they come straight from the user).
        # Assumes g.db.execute forwards positional parameters to the DB-API
        # driver, as a SQLAlchemy 1.x Engine/Connection does — TODO confirm.
        sql = """
            SELECT
            Species.`Name` AS species_name,
            InbredSet.`Name` AS inbredset_name,
            Tissue.`Name` AS tissue_name,
            ProbeSetFreeze.Name AS probesetfreeze_name,
            ProbeSetFreeze.FullName AS probesetfreeze_fullname,
            ProbeSet.Name AS probeset_name,
            ProbeSet.Symbol AS probeset_symbol,
            CAST(ProbeSet.`description` AS BINARY) AS probeset_description,
            ProbeSet.Chr AS chr,
            ProbeSet.Mb AS mb,
            ProbeSetXRef.Mean AS mean,
            ProbeSetXRef.LRS AS lrs,
            ProbeSetXRef.`Locus` AS locus,
            ProbeSetXRef.`pValue` AS pvalue,
            ProbeSetXRef.`additive` AS additive,
            ProbeSetFreeze.Id AS probesetfreeze_id,
            Geno.Chr as geno_chr,
            Geno.Mb as geno_mb
            FROM Species
            INNER JOIN InbredSet ON InbredSet.`SpeciesId`=Species.`Id`
            INNER JOIN ProbeFreeze ON ProbeFreeze.InbredSetId=InbredSet.`Id`
            INNER JOIN Tissue ON ProbeFreeze.`TissueId`=Tissue.`Id`
            INNER JOIN ProbeSetFreeze ON ProbeSetFreeze.ProbeFreezeId=ProbeFreeze.Id
            INNER JOIN ProbeSetXRef ON ProbeSetXRef.ProbeSetFreezeId=ProbeSetFreeze.Id
            INNER JOIN ProbeSet ON ProbeSet.Id = ProbeSetXRef.ProbeSetId
            LEFT JOIN Geno ON ProbeSetXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id
            WHERE ( MATCH (ProbeSet.Name,ProbeSet.description,ProbeSet.symbol,ProbeSet.alias,ProbeSet.GenbankId, ProbeSet.UniGeneId, ProbeSet.Probe_Target_Description) AGAINST (%s IN BOOLEAN MODE) )
            AND ProbeSetFreeze.confidentiality < 1
            AND ProbeSetFreeze.public > 0
            ORDER BY species_name, inbredset_name, tissue_name, probesetfreeze_name, probeset_name
            LIMIT 6000
            """
        with Bench("Running query"):
            logger.sql(sql)
            rows = g.db.execute(sql, (self.terms,)).fetchall()

        trait_list = []
        # Permissions are looked up once per dataset id and reused
        dataset_to_permissions = {}
        with Bench("Creating trait objects"):
            for i, line in enumerate(rows):
                this_trait = {}
                this_trait['index'] = i + 1
                this_trait['name'] = line[5]
                this_trait['dataset'] = line[3]
                this_trait['dataset_fullname'] = line[4]
                this_trait['hmac'] = hmac.data_hmac('{}:{}'.format(line[5], line[3]))
                this_trait['species'] = line[0]
                this_trait['group'] = line[1]
                this_trait['tissue'] = line[2]
                this_trait['symbol'] = line[6]
                if line[7]:
                    this_trait['description'] = line[7].decode('utf-8', 'replace')
                else:
                    this_trait['description'] = "N/A"

                this_trait['location_repr'] = 'N/A'
                # NULL chromosome/position values are skipped as well, so that
                # float(None) can no longer abort the whole search
                if line[8] not in (None, "NULL", "") and line[9] not in (None, 0):
                    this_trait['location_repr'] = 'Chr%s: %.6f' % (line[8], float(line[9]))

                try:
                    this_trait['mean'] = '%.3f' % line[10]
                except (TypeError, ValueError):
                    # NULL or non-numeric mean
                    this_trait['mean'] = "N/A"

                this_trait['LRS_score_repr'] = "N/A"
                if line[11] != "" and line[11] is not None:
                    this_trait['LRS_score_repr'] = '%3.1f' % line[11]

                this_trait['additive'] = "N/A"
                if line[14] != "" and line[14] is not None:
                    this_trait['additive'] = '%.3f' % line[14]

                this_trait['dataset_id'] = line[15]
                this_trait['locus_chr'] = line[16]
                this_trait['locus_mb'] = line[17]

                dataset_ob = SimpleNamespace(id=this_trait["dataset_id"], type="ProbeSet",species=this_trait["species"])
                if dataset_ob.id not in dataset_to_permissions:
                    permissions = check_resource_availability(dataset_ob)
                    dataset_to_permissions[dataset_ob.id] = permissions
                else:
                    # Previously assigned to a misspelled name, so a cached
                    # dataset silently reused the permissions of whichever
                    # dataset had been checked last.
                    permissions = dataset_to_permissions[dataset_ob.id]

                if "view" not in permissions['data']:
                    continue

                max_lrs_text = "N/A"
                if this_trait['locus_chr'] is not None and this_trait['locus_mb'] is not None:
                    max_lrs_text = "Chr" + str(this_trait['locus_chr']) + ": " + str(this_trait['locus_mb'])
                this_trait['max_lrs_text'] = max_lrs_text

                trait_list.append(this_trait)

        self.trait_count = len(trait_list)
        self.trait_list = json.dumps(trait_list)

        self.header_fields = ['Index',
                              'Record',
                              'Species',
                              'Group',
                              'Tissue',
                              'Dataset',
                              'Symbol',
                              'Description',
                              'Location',
                              'Mean',
                              'Max LRS',
                              'Max LRS Location',
                              'Additive Effect']

    elif self.type == "phenotype":
        search_term = self.terms
        group_clause = ""
        query_params = []
        if "_" in self.terms:
            term_parts = self.terms.split("_")
            # A three-character prefix is taken to be an InbredSetCode
            if len(term_parts[0]) == 3:
                search_term = term_parts[1]
                group_clause = "AND InbredSet.`InbredSetCode` = %s"
                query_params.append(term_parts[0])

        # Whole-word match. The pattern is bound as a parameter (ten times,
        # once per searched column) rather than formatted into the SQL text.
        # Regex metacharacters in the term are still interpreted by MySQL.
        word_pattern = "[[:<:]]" + search_term + "[[:>:]]"
        query_params.extend([word_pattern] * 10)

        sql = """
            SELECT
            Species.`Name`,
            InbredSet.`Name`,
            PublishFreeze.`Name`,
            PublishFreeze.`FullName`,
            PublishXRef.`Id`,
            CAST(Phenotype.`Pre_publication_description` AS BINARY),
            CAST(Phenotype.`Post_publication_description` AS BINARY),
            Publication.`Authors`,
            Publication.`Year`,
            Publication.`PubMed_ID`,
            PublishXRef.`LRS`,
            PublishXRef.`additive`,
            InbredSet.`InbredSetCode`,
            PublishXRef.`mean`
            FROM Species,InbredSet,PublishFreeze,PublishXRef,Phenotype,Publication
            WHERE PublishXRef.`InbredSetId`=InbredSet.`Id`
            AND PublishFreeze.`InbredSetId`=InbredSet.`Id`
            AND InbredSet.`SpeciesId`=Species.`Id`
            {group_clause}
            AND PublishXRef.`PhenotypeId`=Phenotype.`Id`
            AND PublishXRef.`PublicationId`=Publication.`Id`
            AND (Phenotype.Post_publication_description REGEXP %s
                OR Phenotype.Pre_publication_description REGEXP %s
                OR Phenotype.Pre_publication_abbreviation REGEXP %s
                OR Phenotype.Post_publication_abbreviation REGEXP %s
                OR Phenotype.Lab_code REGEXP %s
                OR Publication.PubMed_ID REGEXP %s
                OR Publication.Abstract REGEXP %s
                OR Publication.Title REGEXP %s
                OR Publication.Authors REGEXP %s
                OR PublishXRef.Id REGEXP %s)
            ORDER BY Species.`Name`, InbredSet.`Name`, PublishXRef.`Id`
            LIMIT 6000
            """.format(group_clause=group_clause)
        logger.sql(sql)
        rows = g.db.execute(sql, tuple(query_params)).fetchall()

        trait_list = []
        with Bench("Creating trait objects"):
            for i, line in enumerate(rows):
                this_trait = {}
                this_trait['index'] = i + 1
                this_trait['name'] = str(line[4])
                # Prefix the record id with the group's code when it has one
                if line[12]:
                    this_trait['display_name'] = line[12] + "_" + this_trait['name']
                else:
                    this_trait['display_name'] = this_trait['name']
                this_trait['dataset'] = line[2]
                this_trait['dataset_fullname'] = line[3]
                this_trait['hmac'] = hmac.data_hmac('{}:{}'.format(line[4], line[2]))
                this_trait['species'] = line[0]
                this_trait['group'] = line[1]

                # The post-publication description is only used when the
                # record has a PubMed id
                if line[9] is not None and line[6] is not None:
                    this_trait['description'] = line[6].decode('utf-8', 'replace')
                elif line[5] is not None:
                    this_trait['description'] = line[5].decode('utf-8', 'replace')
                else:
                    this_trait['description'] = "N/A"

                if line[13] is not None and line[13] != "":
                    this_trait['mean'] = line[13]
                else:
                    this_trait['mean'] = "N/A"

                this_trait['authors'] = line[7]
                this_trait['year'] = line[8]
                # A NULL year no longer raises AttributeError
                if line[8] is not None and str(line[8]).isdigit():
                    this_trait['pubmed_text'] = this_trait['year']
                else:
                    this_trait['pubmed_text'] = "N/A"

                if line[9] != "" and line[9] is not None:
                    # Column 9 is Publication.PubMed_ID; the link used to be
                    # built from column 8, the publication year
                    this_trait['pubmed_link'] = webqtlConfig.PUBMEDLINK_URL % line[9]
                else:
                    this_trait['pubmed_link'] = "N/A"

                this_trait['LRS_score_repr'] = "N/A"
                if line[10] != "" and line[10] is not None:
                    this_trait['LRS_score_repr'] = '%3.1f' % line[10]

                this_trait['additive'] = "N/A"
                if line[11] != "" and line[11] is not None:
                    this_trait['additive'] = '%.3f' % line[11]

                this_trait['max_lrs_text'] = "N/A"
                trait_ob = create_trait(dataset_name=this_trait['dataset'],
                                        name=this_trait['name'],
                                        get_qtl_info=True,
                                        get_sample_info=False)
                if not trait_ob:
                    continue

                if this_trait['dataset'] == this_trait['group'] + "Publish":
                    # Traits without QTL info may lack these attributes
                    locus_chr = getattr(trait_ob, "locus_chr", "")
                    locus_mb = getattr(trait_ob, "locus_mb", "")
                    if locus_chr != "" and locus_mb != "":
                        this_trait['max_lrs_text'] = "Chr" + str(locus_chr) + ": " + str(locus_mb)

                trait_list.append(this_trait)

        self.trait_count = len(trait_list)
        self.trait_list = json.dumps(trait_list)

        self.header_fields = ['Index',
                              'Species',
                              'Group',
                              'Record',
                              'Description',
                              'Authors',
                              'Year',
                              'Max LRS',
                              'Max LRS Location',
                              'Additive Effect']
def gen_search_result(self):
    """
    Get the info displayed in the search result table from the set of
    results computed in the "search" function

    Reads ``self.results`` (each non-empty entry's first element is used as a
    trait id) and ``self.dataset``; sets ``self.trait_list`` to a list of
    per-trait dicts and ``self.header_data_names`` to the column keys for the
    dataset type (ProbeSet, Publish or Geno).
    """
    trait_list = []

    # NOTE(review): the returned species is never used below; the call is
    # kept in case retrieve_species has side effects — confirm and remove.
    species = webqtlDatabaseFunction.retrieve_species(
        self.dataset.group.name)

    def format_3dp(value):
        # Both "" and None mean "no value". Only "" used to be checked for
        # the additive effect, so a None additive raised TypeError.
        if value != "" and value is not None:
            return f"{value:.3f}"
        return "N/A"

    def format_lod(trait):
        # LRS -> LOD conversion; the LRS representation may be non-numeric
        try:
            return f"{float(trait.LRS_score_repr) / 4.61:.1f}"
        except (AttributeError, TypeError, ValueError):
            return "N/A"

    # result_set represents the results for each search term; a search of
    # "shh grin2b" would have two sets of results, one for each term
    logger.debug("self.results is:", pf(self.results))

    for index, result in enumerate(self.results):
        if not result:
            continue

        #### Excel file needs to be generated ####

        trait_id = result[0]
        this_trait = create_trait(dataset=self.dataset,
                                  name=trait_id,
                                  get_qtl_info=True,
                                  get_sample_info=False)
        if not this_trait:
            continue

        trait_dict = {}
        trait_dict['index'] = index + 1
        trait_dict['name'] = this_trait.name
        if this_trait.dataset.type == "Publish":
            trait_dict['display_name'] = this_trait.display_name
        else:
            trait_dict['display_name'] = this_trait.name
        trait_dict['dataset'] = this_trait.dataset.name
        trait_dict['hmac'] = hmac.data_hmac('{}:{}'.format(
            this_trait.name, this_trait.dataset.name))

        if this_trait.dataset.type == "ProbeSet":
            trait_dict['symbol'] = this_trait.symbol
            trait_dict['description'] = this_trait.description_display or "N/A"
            trait_dict['location'] = this_trait.location_repr
            trait_dict['mean'] = format_3dp(this_trait.mean)
            trait_dict['additive'] = format_3dp(this_trait.additive)
            trait_dict['lod_score'] = format_lod(this_trait)
            trait_dict['lrs_location'] = this_trait.LRS_location_repr
        elif this_trait.dataset.type == "Geno":
            trait_dict['location'] = this_trait.location_repr
        elif this_trait.dataset.type == "Publish":
            trait_dict['description'] = this_trait.description_display or "N/A"
            trait_dict['authors'] = this_trait.authors
            trait_dict['pubmed_id'] = "N/A"
            if this_trait.pubmed_id:
                trait_dict['pubmed_id'] = this_trait.pubmed_id
                trait_dict['pubmed_link'] = this_trait.pubmed_link
                trait_dict['pubmed_text'] = this_trait.pubmed_text
            trait_dict['mean'] = format_3dp(this_trait.mean)
            trait_dict['lod_score'] = format_lod(this_trait)
            trait_dict['lrs_location'] = this_trait.LRS_location_repr
            trait_dict['additive'] = format_3dp(this_trait.additive)

        # Convert any bytes in dict to a normal utf-8 string
        for key, value in trait_dict.items():
            if isinstance(value, bytes):
                trait_dict[key] = value.decode('utf-8')

        trait_list.append(trait_dict)

    self.trait_list = trait_list

    # NOTE(review): the rows above store the score under 'lod_score' while
    # these column lists name 'lrs_score' — verify against the consumer of
    # header_data_names before changing either side.
    if self.dataset.type == "ProbeSet":
        self.header_data_names = [
            'index',
            'display_name',
            'symbol',
            'description',
            'location',
            'mean',
            'lrs_score',
            'lrs_location',
            'additive'
        ]
    elif self.dataset.type == "Publish":
        self.header_data_names = [
            'index',
            'display_name',
            'description',
            'mean',
            'authors',
            'pubmed_text',
            'lrs_score',
            'lrs_location',
            'additive'
        ]
    elif self.dataset.type == "Geno":
        self.header_data_names = ['index', 'display_name', 'location']