def view_collection():
    """Show the traits stored in one of the current user's collections.

    Reads ``uc_id`` from the request's query arguments, finds the matching
    entry in ``g.user_session.user_collections`` and builds a trait object
    for every ``"<trait name>:<dataset name>"`` member of that collection.

    Returns:
        The JSON-serialised list of traits when a ``json`` query argument is
        present, otherwise the rendered ``collections/view.html`` template.
    """
    params = request.args

    uc_id = params['uc_id']
    # next() is called without a default, so an unknown uc_id raises
    # StopIteration.
    uc = next(entry for entry in g.user_session.user_collections
              if entry["id"] == uc_id)
    members = uc["members"]

    trait_obs = []
    json_version = []

    for member in members:
        name, dataset_name = member.split(':')
        is_temp = dataset_name == "Temp"
        if is_temp:
            # For temp traits the group is the third "_"-separated field of
            # the trait name.
            dataset = create_dataset(dataset_name,
                                     dataset_type="Temp",
                                     group_name=name.split("_")[2])
        else:
            dataset = create_dataset(dataset_name)

        trait_ob = create_trait(name=name, dataset=dataset)
        if not is_temp:
            # Only non-temp traits get their stored info (incl. QTL info).
            trait_ob = retrieve_trait_info(trait_ob,
                                           dataset,
                                           get_qtl_info=True)

        trait_obs.append(trait_ob)
        json_version.append(jsonable(trait_ob))

    if "json" in params:
        return json.dumps(json_version)
    return render_template("collections/view.html",
                           trait_obs=trait_obs,
                           uc=uc)
Beispiel #2
0
def get_diff_of_vals(new_vals: Dict, trait_id: str) -> Dict:
    """ Get the diff between current sample values and the values in the DB

    Given a dict of the changed values and the trait/dataset ID (formatted as
    "<trait name>:<dataset name>"), return a Dict with one key per sample
    whose value differs. Each value is a dict with the keys "new_val" and
    "old_val".

    Values are compared after rounding to 3 decimal places. A value that is
    missing for a sample, or that cannot be converted to a float, is
    represented as "x".

    """

    id_parts = trait_id.split(":")
    trait_name = id_parts[0]
    dataset_name = id_parts[1]
    trait_ob = create_trait(name=trait_name, dataset_name=dataset_name)

    old_vals = {
        sample: trait_ob.data[sample].value
        for sample in trait_ob.data
    }

    def _rounded_or_x(vals, sample):
        """Return vals[sample] rounded to 3 places, or "x" if unusable."""
        try:
            return round(float(vals[sample]), 3)
        except (KeyError, TypeError, ValueError):
            # KeyError: sample absent on this side; TypeError/ValueError:
            # value is None or not numeric (e.g. an "x" placeholder).
            return "x"

    # Every sample that appears on either side has to be compared.
    all_samples = set(new_vals.keys()) | set(old_vals.keys())

    diff_dict = {}
    for sample in all_samples:
        new_val = _rounded_or_x(new_vals, sample)
        old_val = _rounded_or_x(old_vals, sample)

        if new_val != old_val:
            diff_dict[sample] = {"new_val": new_val, "old_val": old_val}

    return diff_dict
Beispiel #3
0
def gen_covariates_file(this_dataset, covariates, samples):
    """Write the covariate values for the given samples to a flat file.

    Args:
        this_dataset: dataset whose group supplies the sample order and the
            output file name (``<group name>_covariates.txt``).
        covariates: comma-separated ``"<trait name>:<dataset name>"`` items.
        samples: container of sample names to keep.

    The file is written under ``flat_files('mapping')``. It has one line per
    kept sample and one column per covariate; every value is followed by a
    tab. Samples without a value for a covariate are written as "-9".
    """
    columns = []
    for spec in covariates.split(","):
        trait_name = spec.split(":")[0]
        dataset_ob = create_dataset(spec.split(":")[1])
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)
        this_dataset.group.get_samplelist()
        group_samples = this_dataset.group.samplelist
        values_by_sample = trait_ob.data
        columns.append([
            values_by_sample[sample].value
            if sample in values_by_sample else "-9"
            for sample in group_samples
            if sample in samples
        ])

    out_path = (f"{flat_files('mapping')}/"
                f"{this_dataset.group.name}_covariates.txt")
    with open(out_path, "w") as outfile:
        # Row count comes from the first covariate; each row holds the
        # value at the same position from every covariate.
        for row_idx in range(len(columns[0])):
            row = "".join(str(column[row_idx]) + "\t" for column in columns)
            outfile.write(row)
            outfile.write("\n")
 def get_trait_db_obs(self, trait_db_list):
     """Fill ``self.trait_list`` with ``(trait, dataset)`` pairs.

     Each item of ``trait_db_list`` is a ``"<trait name>:<dataset name>"``
     string. The final item of the list is never processed.
     """
     self.trait_list = []
     # NOTE(review): the last entry is skipped on purpose by the original
     # logic -- presumably callers pass a trailing placeholder; confirm.
     for trait_db in trait_db_list[:-1]:
         trait_name, dataset_name = trait_db.split(":")
         dataset_ob = data_set.create_dataset(dataset_name)
         trait_ob = create_trait(dataset=dataset_ob,
                                 name=trait_name,
                                 cellid=None)
         self.trait_list.append((trait_ob, dataset_ob))
def do_correlation(start_vars):
    """Run a correlation for one trait and return the leading results.

    ``start_vars`` must contain ``db``, ``target_db`` and ``trait_id``; it is
    also passed to ``init_corr_params`` to obtain the correlation settings.

    Returns:
        A list of dicts, one per trait, limited to the first
        ``corr_params['return_count']`` keys of the computed results. The
        keys of each dict depend on ``corr_params['type']``:

        * "tissue": trait, sample_r, #_strains, p_value, symbol
        * "literature" / "lit": trait, sample_r, gene_id
        * anything else: trait, sample_r, #_strains, p_value
    """
    for required_key in ('db', 'target_db', 'trait_id'):
        assert (required_key in start_vars)

    this_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])
    this_trait = retrieve_sample_data(
        create_trait(dataset=this_dataset, name=start_vars['trait_id']),
        this_dataset)

    corr_params = init_corr_params(start_vars)
    corr_results = calculate_results(this_trait, this_dataset, target_dataset,
                                     corr_params)

    leading_traits = list(corr_results.keys())[:corr_params['return_count']]

    final_results = []
    for trait in leading_traits:
        corr_type = corr_params['type']
        values = corr_results[trait]
        if corr_type == "tissue":
            sample_r, num_overlap, sample_p, symbol = values
            row = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol
            }
        elif corr_type in ("literature", "lit"):
            gene_id, sample_r = values
            row = {"trait": trait, "sample_r": sample_r, "gene_id": gene_id}
        else:
            # Note the different field order compared with the tissue case.
            sample_r, sample_p, num_overlap = values
            row = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p
            }
        final_results.append(row)

    return final_results
Beispiel #6
0
def create_target_this_trait(start_vars):
    """Create the trait and the datasets needed for a correlation.

    Reads ``dataset``, ``corr_dataset`` and ``trait_id`` from ``start_vars``
    (plus ``group`` when the dataset is "Temp").

    Returns:
        A tuple ``(this_dataset, this_trait, target_dataset, sample_data)``
        in which ``sample_data`` is always an empty tuple.
    """
    source_name = start_vars['dataset']
    if source_name == "Temp":
        dataset_kwargs = dict(dataset_name="Temp",
                              dataset_type="Temp",
                              group_name=start_vars['group'])
    else:
        dataset_kwargs = dict(dataset_name=source_name)

    this_dataset = data_set.create_dataset(**dataset_kwargs)
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['corr_dataset'])
    this_trait = create_trait(dataset=this_dataset,
                              name=start_vars['trait_id'])

    return (this_dataset, this_trait, target_dataset, ())
Beispiel #7
0
def get_export_metadata(trait_id, dataset_name):
    """Build the metadata header rows for an exported trait.

    Returns a list of single-item lists (one per row) followed by one empty
    list. "Publish" datasets get phenotype/publication rows; all other
    dataset types get record, symbol (when set), dataset and group rows.
    """
    dataset = data_set.create_dataset(dataset_name)
    this_trait = create_trait(dataset=dataset,
                              name=trait_id,
                              cellid=None,
                              get_qtl_info=False)

    is_publish = dataset.type == "Publish"
    trait_url = ("http://genenetwork.org/show_trait?trait_id=" + trait_id +
                 "&dataset=" + dataset_name)

    if is_publish:
        metadata = [
            ["Phenotype ID: " + trait_id],
            ["Phenotype URL: " + trait_url],
            ["Group: " + dataset.group.name],
            # Commas in the description are replaced by `","`.
            ["Phenotype: " +
             this_trait.description_display.replace(",", '","')],
            ["Authors: " + (this_trait.authors or "N/A")],
            ["Title: " + (this_trait.title or "N/A")],
            ["Journal: " + (this_trait.journal or "N/A")],
            ["Dataset Link: http://gn1.genenetwork.org/webqtl/main.py?FormID=sharinginfo&InfoPageName="
             + dataset.name],
        ]
    else:
        metadata = [
            ["Record ID: " + trait_id],
            ["Trait URL: " + trait_url],
        ]
        if this_trait.symbol:
            metadata.append(["Symbol: " + this_trait.symbol])
        metadata += [
            ["Dataset: " + dataset.name],
            ["Group: " + dataset.group.name],
        ]

    # Trailing empty row.
    metadata.append([])

    return metadata
def set_template_vars(start_vars, correlation_data):
    """Add the values the correlation results template needs.

    ``correlation_data`` is updated in place and also returned. Its
    ``target_dataset`` entry (a dataset name on input) is replaced by that
    dataset's ``as_dict()``; the keys ``this_trait``, ``this_dataset``,
    ``table_json``, ``corr_method``, ``filter_cols``, ``header_fields`` and
    ``formatted_corr_type`` are set.
    """
    corr_type = start_vars['corr_type']
    corr_method = start_vars['corr_sample_method']

    source_name = start_vars['dataset']
    if source_name == "Temp":
        this_dataset_ob = create_dataset(dataset_name="Temp",
                                         dataset_type="Temp",
                                         group_name=start_vars['group'])
    else:
        this_dataset_ob = create_dataset(dataset_name=source_name)
    this_trait = create_trait(dataset=this_dataset_ob,
                              name=start_vars['trait_id'])

    correlation_data['this_trait'] = jsonable(this_trait, this_dataset_ob)
    correlation_data['this_dataset'] = this_dataset_ob.as_dict()

    target_dataset_ob = create_dataset(correlation_data['target_dataset'])
    correlation_data['target_dataset'] = target_dataset_ob.as_dict()

    correlation_data['table_json'] = correlation_json_for_table(
        correlation_data,
        correlation_data['this_trait'],
        correlation_data['this_dataset'],
        target_dataset_ob)

    # Column indices stored as filter_cols, chosen by target dataset type.
    target_type = target_dataset_ob.type
    filter_cols_by_type = {"ProbeSet": [7, 6], "Publish": [6, 0]}
    filter_cols = filter_cols_by_type.get(target_type, [4, 0])

    correlation_data['corr_method'] = corr_method
    correlation_data['filter_cols'] = filter_cols
    correlation_data['header_fields'] = get_header_fields(
        target_type, correlation_data['corr_method'])
    correlation_data['formatted_corr_type'] = get_formatted_corr_type(
        corr_type, corr_method)

    return correlation_data
Beispiel #9
0
def gen_covariates_file(this_dataset, covariates, samples):
    """Write covariate values to a hashed flat file and return its name.

    Args:
        this_dataset: dataset whose group supplies the sample order; its
            name is part of the hashed file name.
        covariates: comma-separated ``"<trait name>:<dataset name>"`` items.
            For the "Temp" dataset the group is the third "_"-separated
            field of the trait name.
        samples: container of sample names to keep.

    Returns:
        The file name without extension (``COVAR_<hash>``). The file itself
        is ``<name>.txt`` under ``flat_files('mapping')`` with one line per
        kept sample and one tab-terminated column per covariate; missing
        values are written as "-9".
    """
    columns = []
    for spec in covariates.split(","):
        trait_name = spec.split(":")[0]
        dataset_name = spec.split(":")[1]
        if dataset_name == "Temp":
            dataset_ob = create_dataset(dataset_name="Temp",
                                        dataset_type="Temp",
                                        group_name=trait_name.split("_")[2])
        else:
            dataset_ob = create_dataset(dataset_name)
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)
        this_dataset.group.get_samplelist()
        group_samples = this_dataset.group.samplelist
        values_by_sample = trait_ob.data
        columns.append([
            values_by_sample[sample].value
            if sample in values_by_sample else "-9"
            for sample in group_samples
            if sample in samples
        ])

    # The name is derived from the dataset name and the covariate data.
    digest = generate_hash_of_string(this_dataset.name + str(columns))
    filename = "COVAR_" + digest.replace("/", "_")

    with open(f"{flat_files('mapping')}/{filename}.txt", "w") as outfile:
        for row_idx in range(len(columns[0])):
            row = "".join(str(column[row_idx]) + "\t" for column in columns)
            outfile.write(row)
            outfile.write("\n")

    return filename
def cofactors_to_dict(cofactors: str, dataset_ob, samples) -> Dict:
    """Given a string of cofactors, the trait being mapped's dataset ob,
    and list of samples, return cofactor data as a Dict

    ``cofactors`` is a comma-separated list of
    "<trait name>:<dataset name>" items. Only cofactors whose dataset name
    equals ``dataset_ob.name`` are included. The result maps each such trait
    name to a list with one string per entry of ``samples``: the value
    rounded to 3 decimal places, or "NA" when the trait has no data for that
    sample. An empty ``cofactors`` string yields an empty dict.

    """
    cofactor_dict = {}
    if not cofactors:
        return cofactor_dict

    # The return value is not used here; the call is kept because it may
    # populate state on the group that other code relies on (TODO confirm).
    dataset_ob.group.get_samplelist()

    for cofactor in cofactors.split(","):
        cofactor_name, cofactor_dataset = cofactor.split(":")
        if cofactor_dataset != dataset_ob.name:
            continue
        sample_data = create_trait(dataset=dataset_ob,
                                   name=cofactor_name).data
        cofactor_dict[cofactor_name] = [
            str(round(float(sample_data[sample].value), 3))
            if sample in sample_data else "NA"
            for sample in samples
        ]
    return cofactor_dict
    def __init__(self, params):
        """Compute everything needed to plot one trait against another.

        Args:
            params: mapping with the keys ``dataset_1``, ``dataset_2``,
                ``trait_1``, ``trait_2`` and ``method``. A dataset value
                containing "Temp" is treated as a temp dataset whose group
                is the second "_"-separated field of that value.

        Sets ``self.data`` (the two value lists), ``self.rdata`` (their
        ranks), ``self.indIDs``, ``self.collections_exist`` and
        ``self.js_data`` / ``self.jsdata`` (the same dict holding the values
        plus linear-regression results for both raw values and ranks).
        """

        def _dataset_for(name):
            """Create the dataset named by one of the dataset_N params."""
            if "Temp" in name:
                return data_set.create_dataset(
                    dataset_name="Temp",
                    dataset_type="Temp",
                    group_name=name.split("_")[1])
            return data_set.create_dataset(name)

        self.dataset_1 = _dataset_for(params['dataset_1'])
        self.dataset_2 = _dataset_for(params['dataset_2'])

        self.trait_1 = create_trait(name=params['trait_1'], dataset=self.dataset_1)
        self.trait_2 = create_trait(name=params['trait_2'], dataset=self.dataset_2)

        self.method = params['method']

        # Build the combined sample list with `+` rather than `+=`: the
        # in-place form extended dataset_1.group.samplelist itself, so the
        # group's own sample list grew every time this ran.
        group_1 = self.dataset_1.group
        primary_samples = group_1.samplelist
        if group_1.parlist is not None:
            primary_samples = primary_samples + group_1.parlist
        if group_1.f1list is not None:
            primary_samples = primary_samples + group_1.f1list

        # Both traits are loaded against dataset_1's samples.
        self.trait_1 = retrieve_sample_data(self.trait_1, self.dataset_1, primary_samples)
        self.trait_2 = retrieve_sample_data(self.trait_2, self.dataset_2, primary_samples)

        samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(self.trait_1.data, self.trait_2.data)

        self.indIDs = list(samples_1.keys())
        vals_1 = [samples_1[sample].value for sample in samples_1]
        vals_2 = [samples_2[sample].value for sample in samples_2]
        self.data = [vals_1, vals_2]

        slope, intercept, r_value, p_value, _std_err = stats.linregress(vals_1, vals_2)

        # NOTE(review): any negative slope also takes the exponent format
        # because the comparison is not on the absolute value -- confirm
        # whether that is intended.
        if slope < 0.001:
            slope_string = '%.3E' % slope
        else:
            slope_string = '%.3f' % slope

        # Pad each axis by 10% of its data range on both sides.
        x_buffer = (max(vals_1) - min(vals_1))*0.1
        y_buffer = (max(vals_2) - min(vals_2))*0.1

        x_range = [min(vals_1) - x_buffer, max(vals_1) + x_buffer]
        y_range = [min(vals_2) - y_buffer, max(vals_2) + y_buffer]

        intercept_coords = get_intercept_coords(slope, intercept, x_range, y_range)

        # Same regression on the ranks of the values.
        rx = stats.rankdata(vals_1)
        ry = stats.rankdata(vals_2)
        self.rdata = [rx.tolist(), ry.tolist()]
        srslope, srintercept, srr_value, srp_value, _srstd_err = stats.linregress(rx, ry)

        if srslope < 0.001:
            srslope_string = '%.3E' % srslope
        else:
            srslope_string = '%.3f' % srslope

        # A single range, derived from the x ranks, is used for both axes.
        x_buffer = (max(rx) - min(rx))*0.1
        sr_range = [min(rx) - x_buffer, max(rx) + x_buffer]

        sr_intercept_coords = get_intercept_coords(srslope, srintercept, sr_range, sr_range)

        # Stored as the strings "True"/"False", not booleans.
        self.collections_exist = "False"
        if g.user_session.num_collections > 0:
            self.collections_exist = "True"

        self.js_data = dict(
            data = self.data,
            rdata = self.rdata,
            indIDs = self.indIDs,
            trait_1 = self.trait_1.dataset.name + ": " + str(self.trait_1.name),
            trait_2 = self.trait_2.dataset.name + ": " + str(self.trait_2.name),
            samples_1 = samples_1,
            samples_2 = samples_2,
            num_overlap = num_overlap,
            vals_1 = vals_1,
            vals_2 = vals_2,
            x_range = x_range,
            y_range = y_range,
            sr_range = sr_range,
            intercept_coords = intercept_coords,
            sr_intercept_coords = sr_intercept_coords,

            slope = slope,
            slope_string = slope_string,
            intercept = intercept,
            r_value = r_value,
            p_value = p_value,

            srslope = srslope,
            srslope_string = srslope_string,
            srintercept = srintercept,
            srr_value = srr_value,
            srp_value = srp_value
        )
        # Second name for the same dict.
        self.jsdata = self.js_data
Beispiel #12
0
    def __init__(self, user_id, kw):
        """Gather the data needed to display a single trait.

        Which trait is loaded depends on ``kw``:

        * ``trait_id`` present and ``dataset`` not "Temp": a stored trait,
          set up by ``helper_functions.get_species_dataset_trait``.
        * otherwise, ``group`` present: a newly submitted temp trait. An ID
          of the form ``Temp_<species>_<group>_<timestamp>`` is generated
          and the pasted values are stored in Redis under it.
        * otherwise: an existing temp trait. Species and group are parsed
          from ``trait_id`` and the values are read back from Redis.

        The method then derives BLAT link URLs, sample groups, summary
        flags and finally ``self.hddn`` (an OrderedDict of form defaults)
        and ``self.js_data``.

        Args:
            user_id: passed to ``get_highest_user_access_role``.
            kw: mapping of request values as described above.
        """
        if 'trait_id' in kw and kw['dataset'] != "Temp":
            self.temp_trait = False
            self.trait_id = kw['trait_id']
            # Presumably sets self.dataset and self.this_trait, which are
            # read below -- confirm in helper_functions.
            helper_functions.get_species_dataset_trait(self, kw)
        elif 'group' in kw:
            self.temp_trait = True
            self.trait_id = "Temp_" + kw['species'] + "_" + kw['group'] + \
                "_" + datetime.datetime.now().strftime("%m%d%H%M%S")
            self.temp_species = kw['species']
            self.temp_group = kw['group']
            self.dataset = data_set.create_dataset(dataset_name="Temp",
                                                   dataset_type="Temp",
                                                   group_name=self.temp_group)

            # Put values in Redis so they can be looked up later if
            # added to a collection
            Redis.set(self.trait_id, kw['trait_paste'], ex=ONE_YEAR)
            self.trait_vals = kw['trait_paste'].split()
            self.this_trait = create_trait(dataset=self.dataset,
                                           name=self.trait_id,
                                           cellid=None)
        else:
            self.temp_trait = True
            self.trait_id = kw['trait_id']
            self.temp_species = self.trait_id.split("_")[1]
            self.temp_group = self.trait_id.split("_")[2]
            self.dataset = data_set.create_dataset(dataset_name="Temp",
                                                   dataset_type="Temp",
                                                   group_name=self.temp_group)
            self.this_trait = create_trait(dataset=self.dataset,
                                           name=self.trait_id,
                                           cellid=None)
            self.trait_vals = Redis.get(self.trait_id).split()
        self.resource_id = get_resource_id(self.dataset, self.trait_id)
        self.admin_status = get_highest_user_access_role(
            user_id=user_id,
            resource_id=(self.resource_id or ""),
            gn_proxy_url=GN_PROXY_URL)
        # ZS: Get verify/rna-seq link URLs
        # Best effort: any failure in this section leaves both URLs empty.
        try:
            blatsequence = self.this_trait.blatseq

            # XZ, 06/03/2009: ProbeSet name is not unique among platforms. We should use ProbeSet Id instead.
            # NOTE(review): the names are interpolated straight into the SQL
            # text; switch to bound parameters once the paramstyle accepted
            # by g.db.execute is confirmed.
            probe_query = """SELECT Probe.Sequence, Probe.Name
                             FROM Probe, ProbeSet, ProbeSetFreeze, ProbeSetXRef
                             WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND
                                     ProbeSetXRef.ProbeSetId = ProbeSet.Id AND
                                     ProbeSetFreeze.Name = '%s' AND
                                     ProbeSet.Name = '%s' AND
                                     Probe.ProbeSetId = ProbeSet.Id order by Probe.SerialOrder""" % (
                self.this_trait.dataset.name, self.this_trait.name)

            if not blatsequence:
                seqs = g.db.execute(probe_query).fetchall()
                if not seqs:
                    raise ValueError
                else:
                    blatsequence = ''
                    for seqt in seqs:
                        # Only probes whose name ends in an odd digit are used.
                        if int(seqt[1][-1]) % 2 == 1:
                            # str.strip(): the Python 2 string.strip()
                            # function no longer exists, and the resulting
                            # error was silently swallowed below.
                            blatsequence += seqt[0].strip()

            # --------Hongqiang add this part in order to not only blat ProbeSet, but also blat Probe
            blatsequence = '%3E' + self.this_trait.name + '%0A' + blatsequence + '%0A'

            seqs = g.db.execute(probe_query).fetchall()
            for seqt in seqs:
                if int(seqt[1][-1]) % 2 == 1:
                    blatsequence += '%3EProbe_' + \
                        seqt[1].strip() + '%0A' + seqt[0].strip() + '%0A'

            if self.dataset.group.species == "rat":
                self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('rat', 'rn6',
                                                               blatsequence)
                self.UTHSC_BLAT_URL = ""
            elif self.dataset.group.species == "mouse":
                self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('mouse', 'mm10',
                                                               blatsequence)
                self.UTHSC_BLAT_URL = webqtlConfig.UTHSC_BLAT % (
                    'mouse', 'mm10', blatsequence)
            elif self.dataset.group.species == "human":
                self.UCSC_BLAT_URL = webqtlConfig.UCSC_BLAT % ('human', 'hg38',
                                                               blatsequence)
                self.UTHSC_BLAT_URL = ""
            else:
                self.UCSC_BLAT_URL = ""
                self.UTHSC_BLAT_URL = ""
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            self.UCSC_BLAT_URL = ""
            self.UTHSC_BLAT_URL = ""

        # show_probes is only defined for ProbeSet datasets.
        if self.dataset.type == "ProbeSet":
            self.show_probes = "True"

        trait_units = get_trait_units(self.this_trait)
        self.get_external_links()
        self.build_correlation_tools()

        self.ncbi_summary = get_ncbi_summary(self.this_trait)

        # Get nearest marker for composite mapping
        if not self.temp_trait:
            if check_if_attr_exists(
                    self.this_trait, 'locus_chr'
            ) and self.dataset.type != "Geno" and self.dataset.type != "Publish":
                self.nearest_marker = get_nearest_marker(
                    self.this_trait, self.dataset)
            else:
                self.nearest_marker = ""

        self.make_sample_lists()

        trait_vals_by_group = []
        for sample_type in self.sample_groups:
            trait_vals_by_group.append(get_trait_vals(sample_type.sample_list))

        self.max_digits_by_group = get_max_digits(trait_vals_by_group)

        self.qnorm_vals = quantile_normalize_vals(self.sample_groups,
                                                  trait_vals_by_group)
        self.z_scores = get_z_scores(self.sample_groups, trait_vals_by_group)

        self.temp_uuid = uuid.uuid4()

        self.sample_group_types = OrderedDict()
        if len(self.sample_groups) > 1:
            self.sample_group_types[
                'samples_primary'] = self.dataset.group.name
            self.sample_group_types['samples_other'] = "Other"
            self.sample_group_types['samples_all'] = "All"
        else:
            self.sample_group_types[
                'samples_primary'] = self.dataset.group.name
        sample_lists = [group.sample_list for group in self.sample_groups]

        self.categorical_var_list = []
        self.numerical_var_list = []
        if not self.temp_trait:
            # ZS: Only using first samplelist, since I think mapping only uses those samples
            self.categorical_var_list = get_categorical_variables(
                self.this_trait, self.sample_groups[0])
            self.numerical_var_list = get_numerical_variables(
                self.this_trait, self.sample_groups[0])

        # ZS: Get list of chromosomes to select for mapping
        self.chr_list = [["All", -1]]
        for i, this_chr in enumerate(
                self.dataset.species.chromosomes.chromosomes):
            self.chr_list.append([
                self.dataset.species.chromosomes.chromosomes[this_chr].name, i
            ])

        self.genofiles = self.dataset.group.get_genofiles()
        study_samplelist_json = self.dataset.group.get_study_samplelists()
        self.study_samplelists = [
            study["title"] for study in study_samplelist_json
        ]

        # ZS: No need to grab scales from .geno file unless it's using
        # a mapping method that reads .geno files
        # Fixed: the old condition (`"QTLReaper" or "R/qtl" in ...`) was
        # always true, and it referred to an undefined name `dataset`.
        mapping_names = self.dataset.group.mapping_names
        if "QTLReaper" in mapping_names or "R/qtl" in mapping_names:
            if self.genofiles:
                self.scales_in_geno = get_genotype_scales(self.genofiles)
            else:
                self.scales_in_geno = get_genotype_scales(
                    self.dataset.group.name + ".geno")
        else:
            self.scales_in_geno = {}

        self.has_num_cases = has_num_cases(self.this_trait)

        # ZS: Needed to know whether to display bar chart + get max
        # sample name length in order to set table column width
        self.num_values = 0
        # ZS: So it knows whether to display the Binary R/qtl mapping
        # method, which doesn't work unless all values are 0 or 1
        self.binary = "true"
        # ZS: Since we don't want to show log2 transform option for
        # situations where it doesn't make sense
        self.negative_vals_exist = "false"
        max_samplename_width = 1
        for group in self.sample_groups:
            for sample in group.sample_list:
                if len(sample.name) > max_samplename_width:
                    max_samplename_width = len(sample.name)
                if sample.display_value != "x":
                    self.num_values += 1
                    # Fixed: the old test (`!= 0 or != 1`) was true for every
                    # value, so binary could never stay "true". The numeric
                    # sample.value (also compared just below) is checked.
                    if sample.value not in (0, 1):
                        self.binary = "false"
                    if sample.value < 0:
                        self.negative_vals_exist = "true"

        # ZS: Check whether any attributes have few enough distinct
        # values to show the "Block samples by group" option
        self.categorical_attr_exists = "false"
        for attribute in self.sample_groups[0].attributes:
            if len(self.sample_groups[0].attributes[attribute].distinct_values
                   ) <= 10:
                self.categorical_attr_exists = "true"
                break

        sample_column_width = max_samplename_width * 8

        self.stats_table_width, self.trait_table_width = get_table_widths(
            self.sample_groups, sample_column_width, self.has_num_cases)

        if self.num_values >= 5000:
            self.maf = 0.01
        else:
            self.maf = 0.05

        trait_symbol = None
        short_description = None
        if not self.temp_trait:
            if self.this_trait.symbol:
                trait_symbol = self.this_trait.symbol
                short_description = trait_symbol

            elif hasattr(self.this_trait, 'post_publication_abbreviation'):
                short_description = self.this_trait.post_publication_abbreviation

            elif hasattr(self.this_trait, 'pre_publication_abbreviation'):
                short_description = self.this_trait.pre_publication_abbreviation

        # Todo: Add back in the ones we actually need from below, as we discover we need them
        hddn = OrderedDict()

        if self.dataset.group.allsamples:
            hddn['allsamples'] = ','.join(self.dataset.group.allsamples)
        hddn['primary_samples'] = ','.join(self.primary_sample_names)
        hddn['trait_id'] = self.trait_id
        hddn['trait_display_name'] = self.this_trait.display_name
        hddn['dataset'] = self.dataset.name
        hddn['temp_trait'] = False
        if self.temp_trait:
            hddn['temp_trait'] = True
            hddn['group'] = self.temp_group
            hddn['species'] = self.temp_species
        else:
            hddn['group'] = self.dataset.group.name
            hddn['species'] = self.dataset.group.species
        hddn['use_outliers'] = False
        hddn['method'] = "gemma"
        hddn['selected_chr'] = -1
        hddn['mapping_display_all'] = True
        hddn['suggestive'] = 0
        hddn['study_samplelists'] = json.dumps(study_samplelist_json)
        hddn['num_perm'] = 0
        hddn['categorical_vars'] = ""
        if self.categorical_var_list:
            hddn['categorical_vars'] = ",".join(self.categorical_var_list)
        hddn['manhattan_plot'] = ""
        hddn['control_marker'] = ""
        if not self.temp_trait:
            if hasattr(
                    self.this_trait, 'locus_chr'
            ) and self.this_trait.locus_chr != "" and self.dataset.type != "Geno" and self.dataset.type != "Publish":
                hddn['control_marker'] = self.nearest_marker
        hddn['do_control'] = False
        # NOTE(review): this is a fixed 0.05; self.maf computed above is not
        # used here -- confirm whether that is intended.
        hddn['maf'] = 0.05
        hddn['mapping_scale'] = "physic"
        hddn['compare_traits'] = []
        hddn['export_data'] = ""
        hddn['export_format'] = "excel"
        # When exactly one geno file entry is present, take the default
        # scale from its first item.
        if len(self.scales_in_geno) < 2 and bool(self.scales_in_geno):
            hddn['mapping_scale'] = self.scales_in_geno[list(
                self.scales_in_geno.keys())[0]][0][0]

        # We'll need access to this_trait and hddn in the Jinja2
        # Template, so we put it inside self
        self.hddn = hddn

        js_data = dict(trait_id=self.trait_id,
                       trait_symbol=trait_symbol,
                       max_digits=self.max_digits_by_group,
                       short_description=short_description,
                       unit_type=trait_units,
                       dataset_type=self.dataset.type,
                       species=self.dataset.group.species,
                       scales_in_geno=self.scales_in_geno,
                       data_scale=self.dataset.data_scale,
                       sample_group_types=self.sample_group_types,
                       sample_lists=sample_lists,
                       se_exists=self.sample_groups[0].se_exists,
                       has_num_cases=self.has_num_cases,
                       attributes=self.sample_groups[0].attributes,
                       categorical_attr_exists=self.categorical_attr_exists,
                       categorical_vars=",".join(self.categorical_var_list),
                       num_values=self.num_values,
                       qnorm_values=self.qnorm_vals,
                       zscore_values=self.z_scores,
                       sample_column_width=sample_column_width,
                       temp_uuid=self.temp_uuid)
        self.js_data = js_data
Beispiel #13
0
    def run_analysis(self, requestform):
        """Run a CTL scan for the requested traits and prepare its outputs.

        ``requestform`` supplies ``trait_list`` (comma-separated
        ``trait:dataset`` entries), ``strategy``, ``nperm``, ``parametric``
        and ``significance``.  Genotypes are read from the ``.geno`` file of
        the group that owns the dataset of the *first* listed trait.

        Side effects: sets ``self.trait_db_list``, ``self.results`` (image
        names/paths, the significant-interaction table and the request form)
        and ``self.elements`` (JSON of ``self.nodes_list + self.edges_list``).

        Raises whatever ``int``/``float`` raise when ``nperm`` or
        ``significance`` are missing or malformed, and ``IndexError`` when no
        trait is supplied.
        """
        logger.info("Starting CTL analysis on dataset")
        # Split, trim and drop empty entries in one pass.
        self.trait_db_list = [
            name for name in (
                raw.strip() for raw in requestform['trait_list'].split(','))
            if name
        ]

        logger.debug("strategy:", requestform.get("strategy"))
        strategy = requestform.get("strategy")

        logger.debug("nperm:", requestform.get("nperm"))
        nperm = int(requestform.get("nperm"))

        logger.debug("parametric:", requestform.get("parametric"))
        # NOTE(review): bool() of any non-empty string is True, so a form
        # value such as "False" still enables the parametric option.
        parametric = bool(requestform.get("parametric"))

        logger.debug("significance:", requestform.get("significance"))
        significance = float(requestform.get("significance"))

        # Get the name of the .geno file belonging to the first phenotype
        datasetname = self.trait_db_list[0].split(":")[1]
        dataset = data_set.create_dataset(datasetname)

        genofilelocation = locate(dataset.group.name + ".geno", "genotype")
        parser = genofile_parser.ConvertGenoFile(genofilelocation)
        parser.process_csv()
        logger.debug("dataset group: ", dataset.group)

        # Create a genotype matrix (one row per marker, filled row-wise,
        # then transposed by r_t)
        individuals = parser.individuals
        markers = []
        markernames = []
        for marker in parser.markers:
            markernames.append(marker["name"])
            markers.append(marker["genotypes"])

        genotypes = list(itertools.chain(*markers))
        logger.debug(
            len(genotypes) / len(individuals), "==", len(parser.markers))

        rGeno = r_t(
            ro.r.matrix(r_unlist(genotypes),
                        nrow=len(markernames),
                        ncol=len(individuals),
                        dimnames=r_list(markernames, individuals),
                        byrow=True))

        # Create a phenotype matrix: one value per (trait, individual), with
        # "-999" standing in for individuals that have no value.
        traits = []
        for trait in self.trait_db_list:
            logger.debug("retrieving data for", trait)
            # Empty entries were already removed above, so every trait here
            # contributes exactly one row to the matrix.
            ts = trait.split(':')
            gt = create_trait(name=ts[0], dataset_name=ts[1])
            gt = retrieve_sample_data(gt, dataset, individuals)
            sample_data = gt.data
            for ind in individuals:
                # Direct mapping membership instead of rebuilding a key list
                # for every individual.
                if ind in sample_data:
                    traits.append(sample_data[ind].value)
                else:
                    traits.append("-999")

        rPheno = r_t(
            ro.r.matrix(r_as_numeric(r_unlist(traits)),
                        nrow=len(self.trait_db_list),
                        ncol=len(individuals),
                        dimnames=r_list(self.trait_db_list, individuals),
                        byrow=True))

        logger.debug(rPheno)

        # Use a data frame to store the objects
        rPheno = r_data_frame(rPheno, check_names=False)
        rGeno = r_data_frame(rGeno, check_names=False)

        # Debug: Print the genotype and phenotype files to disk
        #r_write_table(rGeno, "~/outputGN/geno.csv")
        #r_write_table(rPheno, "~/outputGN/pheno.csv")

        # Perform the CTL scan (thread count is fixed at 6 here)
        res = self.r_CTLscan(rGeno,
                             rPheno,
                             strategy=strategy,
                             nperm=nperm,
                             parametric=parametric,
                             nthreads=6)

        # Get significant interactions
        significant = self.r_CTLsignificant(res, significance=significance)

        # Create an image for output
        self.results = {}
        self.results['imgurl1'] = webqtlUtil.genRandStr("CTLline_") + ".png"
        self.results['imgloc1'] = GENERATED_IMAGE_DIR + self.results['imgurl1']

        self.results['ctlresult'] = significant
        # Store the user specified parameters for the output page
        self.results['requestform'] = requestform

        # Create the lineplot
        r_png(self.results['imgloc1'],
              width=1000,
              height=600,
              type='cairo-png')
        self.r_lineplot(res, significance=significance)
        r_dev_off()

        # Image slot 1 is the line plot, so per-trait plots start at slot 2;
        # the phenotype index handed to R is therefore slot - 1.
        for slot, trait in enumerate(self.trait_db_list, start=2):
            url_key = 'imgurl' + str(slot)
            loc_key = 'imgloc' + str(slot)
            # Create the QTL like CTL plots
            self.results[url_key] = webqtlUtil.genRandStr("CTL_") + ".png"
            self.results[loc_key] = GENERATED_IMAGE_DIR + self.results[url_key]
            r_png(self.results[loc_key],
                  width=1000,
                  height=600,
                  type='cairo-png')
            self.r_plotCTLobject(res, (slot - 1),
                                 significance=significance,
                                 main='Phenotype ' + trait)
            r_dev_off()

        # Flush any output from R
        sys.stdout.flush()

        # Create the interactive graph for cytoscape visualization (Nodes and Edges)
        if not isinstance(significant, ri.RNULLType):
            # Index-based loop on purpose: entries of `significant` are
            # rewritten in place below.
            for x in range(len(significant[0])):
                logger.debug(significant[0][x], significant[1][x],
                             significant[2][x])  # Debug to console
                tsS = significant[0][x].split(':')  # Source
                tsT = significant[2][x].split(':')  # Target
                # Retrieve Source and Target info from the DB
                gtS = create_trait(name=tsS[0], dataset_name=tsS[1])
                gtT = create_trait(name=tsT[0], dataset_name=tsT[1])
                self.addNode(gtS)
                self.addNode(gtT)
                self.addEdge(gtS, gtT, significant, x)

                # Update the trait names for the displayed table
                significant[0][x] = "{} ({})".format(gtS.symbol, gtS.name)
                significant[2][x] = "{} ({})".format(gtT.symbol, gtT.name)

        self.elements = json.dumps(self.nodes_list + self.edges_list)
def do_mapping_for_api(start_vars):
    """Run a GEMMA or R/qtl mapping for the API and return its marker results.

    ``start_vars`` must contain ``db`` (dataset name) and ``trait_id``; the
    remaining options are derived by ``initialize_parameters``.

    Returns a 2-tuple ``(payload, format)``:
      * ``(rows, "csv")`` where ``rows`` is a header row followed by one row
        per marker,
      * ``(result_markers, "json")``, or
      * ``(result_markers, None)`` for any other requested format.

    Raises ``ValueError`` when the mapping method is neither ``"gemma"`` nor
    ``"rqtl"`` (previously this surfaced later as an ``UnboundLocalError``).
    """
    assert ('db' in start_vars)
    assert ('trait_id' in start_vars)

    dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    dataset.group.get_markers()
    this_trait = create_trait(dataset=dataset, name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, dataset)

    # Index the trait data by sample name once; setdefault keeps the first
    # key seen for a name, matching a first-match linear scan.
    key_by_sample_name = {}
    for key in this_trait.data:
        key_by_sample_name.setdefault(this_trait.data[key].name, key)

    # `vals` gets one entry per sample in the group ("x" when the trait has
    # no value), while `samples` only lists the samples that have data.
    samples = []
    vals = []
    for sample in dataset.group.samplelist:
        if sample in key_by_sample_name:
            key = key_by_sample_name[sample]
            samples.append(key)
            vals.append(str(this_trait.data[key].value))
        else:
            vals.append("x")

    mapping_params = initialize_parameters(start_vars, dataset, this_trait)

    covariates = ""  #ZS: It seems to take an empty string as default. This should probably be changed.

    mapping_method = mapping_params['mapping_method']
    if mapping_method == "gemma":
        header_row = ["name", "chr", "Mb", "lod_score", "p_value"]
        use_loco = mapping_params['use_loco']
        result_markers = gemma_mapping.run_gemma(
            this_trait, dataset, samples, vals, covariates,
            use_loco, mapping_params['maf'])
        if use_loco == "True":
            #ZS: gemma_mapping returns both results and the filename for LOCO, so need to only grab the former for api
            result_markers = result_markers[0]
    elif mapping_method == "rqtl":
        header_row = ["name", "chr", "cM", "lod_score"]
        with_permutations = mapping_params['num_perm'] > 0
        rqtl_output = rqtl_mapping.run_rqtl_geno(
            vals, dataset, mapping_params['rqtl_method'],
            mapping_params['rqtl_model'], mapping_params['perm_check'],
            mapping_params['num_perm'], mapping_params['do_control'],
            mapping_params['control_marker'],
            mapping_params['manhattan_plot'], mapping_params['pair_scan'])
        if with_permutations:
            # With permutations the call yields a 4-tuple; only the marker
            # results (last element) are returned through the API.
            _perm_output, _suggestive, _significant, result_markers = rqtl_output
        else:
            result_markers = rqtl_output
    else:
        raise ValueError(
            f"Unsupported mapping method: {mapping_method!r}")

    if mapping_params['limit_to']:
        result_markers = result_markers[:mapping_params['limit_to']]

    output_format = mapping_params['format']
    if output_format == "csv":
        output_rows = [header_row]
        output_rows.extend(
            [marker[header] for header in header_row]
            for marker in result_markers)
        return output_rows, output_format
    if output_format == "json":
        return result_markers, output_format
    return result_markers, None
Beispiel #15
0
def add_cofactors(cross, this_dataset, covariates, samples):
    """Add each covariate trait to the R/qtl cross and pull them back out.

    Arguments:
    cross -- cross object that the covariate columns are added to
    this_dataset -- dataset whose group sample list decides which of
                    ``samples`` are considered
    covariates -- comma-separated ``trait:dataset`` entries
    samples -- sample names, in the order the values should be emitted

    Categorical covariates are expanded by ``add_categorical_covar`` (which
    may yield several columns); all others are added as a single column
    named ``covar_<index>``.

    Returns ``(cross, covars_ob)`` where ``covars_ob`` is the result of
    ``pull_var`` over every covariate column that was added.
    """
    ro.numpy2ri.activate()

    covariate_list = covariates.split(",")
    covar_col_names = []
    for i, covariate in enumerate(covariate_list):
        logger.info("Covariate: " + covariate)
        covariate_parts = covariate.split(":")
        trait_name = covariate_parts[0]
        dataset_ob = create_dataset(covariate_parts[1])
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)

        this_dataset.group.get_samplelist()
        trait_samples = set(this_dataset.group.samplelist)
        trait_sample_data = trait_ob.data
        # Samples outside the group's sample list are skipped entirely;
        # samples in the group without a value for this trait become "NA".
        this_covar_data = [
            trait_sample_data[sample].value
            if sample in trait_sample_data else "NA"
            for sample in samples if sample in trait_samples
        ]

        # R vector literal holding this covariate's values.
        covar_as_string = "c(" + ",".join(
            str(item) for item in this_covar_data) + ")"

        datatype = get_trait_data_type(covariate)
        logger.info("Covariate: " + covariate + " is of type: " + datatype)
        if datatype == "categorical":  # Cat variable
            logger.info("call of add_categorical_covar")
            # Expand and add it to the cross
            cross, col_names = add_categorical_covar(cross, covar_as_string, i)
            logger.info("add_categorical_covar returned")
            covar_col_names.extend(col_names)
        else:
            col_name = "covar_" + str(i)
            cross = add_phenotype(cross, covar_as_string, col_name)
            covar_col_names.append(col_name)

    # Joining the collected names avoids a dangling ", " when the last
    # covariate is categorical and expands to no columns.
    covar_name_string = "c(" + ", ".join(
        '"' + col_name + '"' for col_name in covar_col_names) + ")"
    covars_ob = pull_var("trait_covars", cross, covar_name_string)
    return cross, covars_ob
def export_search_results_csv(targs):
    """Build one CSV export per group for the traits in a search result.

    ``targs`` must contain ``export_data``: a JSON document whose ``rows``
    entry is a list of ``trait:dataset:hash`` strings.  The optional keys
    ``database_name``, ``accession_id``, ``search_string`` and
    ``filter_term`` add metadata lines unless their value is the string
    ``"None"``.

    Each CSV starts with the metadata lines, followed by the trait table
    written transposed (one column per trait, one line per field/sample).

    Returns a list of ``[file_name, csv_text]`` pairs, one per group.
    """
    table_data = json.loads(targs['export_data'])
    table_rows = table_data['rows']

    # One UTC timestamp for both metadata lines: the time is labelled "GMT",
    # and a single reading keeps date and time consistent with each other.
    now = datetime.datetime.now(datetime.timezone.utc)

    metadata = []

    if 'database_name' in targs and targs['database_name'] != "None":
        metadata.append(["Data Set: " + targs['database_name']])
    if 'accession_id' in targs and targs['accession_id'] != "None":
        metadata.append([
            "Metadata Link: http://genenetwork.org/webqtl/main.py?FormID=sharinginfo&GN_AccessionId="
            + targs['accession_id']
        ])
    metadata.append(["Export Date: " + now.strftime("%B %d, %Y")])
    metadata.append(["Export Time: " + now.strftime("%H:%M GMT")])
    if 'search_string' in targs and targs['search_string'] != "None":
        metadata.append(["Search Query: " + targs['search_string']])
    if 'filter_term' in targs and targs['filter_term'] != "None":
        metadata.append(["Search Filter Terms: " + targs['filter_term']])
    metadata.append(["Exported Row Number: " + str(len(table_rows))])
    metadata.append([
        "Funding for The GeneNetwork: NIGMS (R01 GM123489, 2017-2021), NIDA (P30 DA044223, 2017-2022), NIA (R01AG043930, 2013-2018), NIAAA (U01 AA016662, U01 AA013499, U24 AA013513, U01 AA014425, 2006-2017), NIDA/NIMH/NIAAA (P20-DA 21131, 2001-2012), NCI MMHCC (U01CA105417), NCRR/BIRN (U24 RR021760)"
    ])
    metadata.append([])

    trait_list = []
    for trait in table_rows:
        trait_name, dataset_name, _hash = trait.split(":")
        trait_ob = create_trait(name=trait_name, dataset_name=dataset_name)
        trait_ob = retrieve_trait_info(trait_ob,
                                       trait_ob.dataset,
                                       get_qtl_info=True)
        trait_list.append(trait_ob)

    table_headers = [
        'Index', 'URL', 'Species', 'Group', 'Dataset', 'Record ID', 'Symbol',
        'Description', 'ProbeTarget', 'PubMed_ID', 'Chr', 'Mb', 'Alias',
        'Gene_ID', 'Homologene_ID', 'UniGene_ID', 'Strand_Probe',
        'Probe_set_specificity', 'Probe_set_BLAT_score',
        'Probe_set_BLAT_Mb_start', 'Probe_set_BLAT_Mb_end', 'QTL_Chr',
        'QTL_Mb', 'Locus_at_Peak', 'Max_LRS', 'P_value_of_MAX',
        'Mean_Expression'
    ]

    traits_by_group = sort_traits_by_group(trait_list)

    file_list = []
    for group, group_traits in traits_by_group.items():
        # Sample columns come from the first trait's group sample list;
        # each sample gets a value column and an "_SE" column.
        sample_headers = []
        for sample in group_traits[0].dataset.group.samplelist:
            sample_headers.append(sample)
            sample_headers.append(sample + "_SE")

        csv_rows = [table_headers + sample_headers]

        for i, trait in enumerate(group_traits):
            # Prefer the symbol, then the abbreviation, then a placeholder.
            trait_symbol = (getattr(trait, "symbol", None)
                            or getattr(trait, "abbreviation", None)
                            or "N/A")
            row_contents = [
                i + 1,
                "https://genenetwork.org/show_trait?trait_id=" +
                str(trait.name) + "&dataset=" + str(trait.dataset.name),
                trait.dataset.group.species,
                trait.dataset.group.name,
                trait.dataset.name,
                trait.name,
                trait_symbol,
                getattr(trait, "description_display", "N/A"),
                getattr(trait, "probe_target_description", "N/A"),
                getattr(trait, "pubmed_id", "N/A"),
                getattr(trait, "chr", "N/A"),
                getattr(trait, "mb", "N/A"),
                trait.alias_fmt,
                getattr(trait, "geneid", "N/A"),
                getattr(trait, "homologeneid", "N/A"),
                getattr(trait, "unigeneid", "N/A"),
                getattr(trait, "strand_probe", "N/A"),
                getattr(trait, "probe_set_specificity", "N/A"),
                getattr(trait, "probe_set_blat_score", "N/A"),
                getattr(trait, "probe_set_blat_mb_start", "N/A"),
                getattr(trait, "probe_set_blat_mb_end", "N/A"),
                getattr(trait, "locus_chr", "N/A"),
                getattr(trait, "locus_mb", "N/A"),
                getattr(trait, "locus", "N/A"),
                getattr(trait, "lrs", "N/A"),
                getattr(trait, "pvalue", "N/A"),
                getattr(trait, "mean", "N/A"),
            ]

            for sample in trait.dataset.group.samplelist:
                if sample in trait.data:
                    row_contents += [
                        trait.data[sample].value, trait.data[sample].variance
                    ]
                else:
                    row_contents += ["x", "x"]

            csv_rows.append(row_contents)

        with io.StringIO() as buff:
            writer = csv.writer(buff)
            writer.writerows(metadata)
            # Transpose so that every trait becomes a column; shorter rows
            # are padded with None, which csv writes as an empty field.
            writer.writerows(itertools.zip_longest(*csv_rows))
            csv_data = buff.getvalue()

        file_name = group + "_traits.csv"
        file_list.append([file_name, csv_data])

    return file_list
Beispiel #17
0
    def __init__(self, start_vars):
        """Correlate one trait against every trait of a target dataset.

        Required ``start_vars`` keys: ``corr_type`` (``"sample"``,
        ``"tissue"`` or ``"lit"``), ``dataset``, ``corr_sample_method``,
        ``corr_samples_group``, ``corr_dataset`` and ``corr_return_results``.
        When ``loc_chr`` is given, ``min_loc_mb`` and ``max_loc_mb`` are
        required too.  A ``dataset`` of ``"Temp"`` additionally reads
        ``group`` and ``trait_id``.  Optional filters: ``min_expr``,
        ``p_range_lower``, ``p_range_upper`` and ``location_type``.

        On completion ``self.correlation_results`` holds the trait objects
        that passed all filters and ``self.json_results`` their JSON form.
        """
        # get trait list from db (database name)
        # calculate correlation with Base vector and targets

        # Check parameters
        assert('corr_type' in start_vars)
        assert(is_str(start_vars['corr_type']))
        assert('dataset' in start_vars)
        # assert('group' in start_vars) permitted to be empty?
        assert('corr_sample_method' in start_vars)
        assert('corr_samples_group' in start_vars)
        assert('corr_dataset' in start_vars)
        assert('corr_return_results' in start_vars)
        if 'loc_chr' in start_vars:
            assert('min_loc_mb' in start_vars)
            assert('max_loc_mb' in start_vars)

        with Bench("Doing correlations"):
            if start_vars['dataset'] == "Temp":
                self.dataset = data_set.create_dataset(
                    dataset_name="Temp",
                    dataset_type="Temp",
                    group_name=start_vars['group'])
                self.trait_id = start_vars['trait_id']
                self.this_trait = create_trait(dataset=self.dataset,
                                               name=self.trait_id,
                                               cellid=None)
            else:
                helper_functions.get_species_dataset_trait(self, start_vars)

            corr_samples_group = start_vars['corr_samples_group']

            self.sample_data = {}
            self.corr_type = start_vars['corr_type']
            self.corr_method = start_vars['corr_sample_method']
            self.min_expr = get_float(start_vars, 'min_expr')
            self.p_range_lower = get_float(start_vars, 'p_range_lower', -1.0)
            self.p_range_upper = get_float(start_vars, 'p_range_upper', 1.0)

            if ('loc_chr' in start_vars and
                    'min_loc_mb' in start_vars and
                    'max_loc_mb' in start_vars):
                self.location_type = get_string(start_vars, 'location_type')
                self.location_chr = get_string(start_vars, 'loc_chr')
                self.min_location_mb = get_int(start_vars, 'min_loc_mb')
                self.max_location_mb = get_int(start_vars, 'max_loc_mb')
            else:
                self.location_type = self.location_chr = self.min_location_mb = self.max_location_mb = None

            self.get_formatted_corr_type()
            self.return_number = int(start_vars['corr_return_results'])

            # The two if statements below append samples to the sample list
            # based upon whether the user selected Primary Samples Only,
            # Other Samples Only, or All Samples

            group = self.dataset.group
            # Work on a copy: extending the group's own samplelist in place
            # would leak the parent/F1 entries into the shared group object.
            primary_samples = list(group.samplelist)
            if group.parlist is not None:
                primary_samples += group.parlist
            if group.f1list is not None:
                primary_samples += group.f1list

            # If either BXD/whatever Only or All Samples, append all of that
            # group's samplelist
            if corr_samples_group != 'samples_other':
                self.process_samples(start_vars, primary_samples)

            # If either Non-BXD/whatever or All Samples, get all samples from
            # this_trait.data and exclude the primary samples (because they
            # would have been added in the previous if statement if the user
            # selected All Samples)
            if corr_samples_group != 'samples_primary':
                if corr_samples_group == 'samples_other':
                    # Parents and F1s are not excluded in this mode.  Either
                    # list may be None, so guard before combining them.
                    parents_and_f1 = (set(group.parlist or ())
                                      | set(group.f1list or ()))
                    primary_samples = [
                        x for x in primary_samples if x not in parents_and_f1
                    ]
                self.process_samples(start_vars,
                                     list(self.this_trait.data.keys()),
                                     primary_samples)

            self.target_dataset = data_set.create_dataset(start_vars['corr_dataset'])
            self.target_dataset.get_trait_data(list(self.sample_data.keys()))

            self.header_fields = get_header_fields(self.target_dataset.type, self.corr_method)

            if self.target_dataset.type == "ProbeSet":
                self.filter_cols = [7, 6]
            elif self.target_dataset.type == "Publish":
                self.filter_cols = [6, 0]
            else:
                self.filter_cols = [4, 0]

            self.correlation_results = []

            self.correlation_data = {}

            if self.corr_type == "tissue":
                self.trait_symbol_dict = self.dataset.retrieve_genes("Symbol")

                tissue_corr_data = self.do_tissue_correlation_for_all_traits()
                if tissue_corr_data is not None:
                    for trait in list(tissue_corr_data.keys())[:self.return_number]:
                        self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait])
                else:
                    for trait, values in list(self.target_dataset.trait_data.items()):
                        self.get_sample_r_and_p_values(trait, values)

            elif self.corr_type == "lit":
                self.trait_geneid_dict = self.dataset.retrieve_genes("GeneId")
                lit_corr_data = self.do_lit_correlation_for_all_traits()

                for trait in list(lit_corr_data.keys())[:self.return_number]:
                    self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait])

            elif self.corr_type == "sample":
                for trait, values in list(self.target_dataset.trait_data.items()):
                    self.get_sample_r_and_p_values(trait, values)

            # Strongest correlations (by absolute value of the first entry)
            # come first.
            self.correlation_data = collections.OrderedDict(
                sorted(self.correlation_data.items(),
                       key=lambda t: -abs(t[1][0])))

            # Chromosome name -> order id, built once instead of rescanning
            # the chromosome table for every trait.  A later entry with the
            # same name overrides an earlier one, as in a full scan.
            chr_order_by_name = {
                chr_info.name: order_id
                for order_id, chr_info in
                self.dataset.species.chromosomes.chromosomes.items()
            }

            #ZS: Convert min/max chromosome to an int for the location range option
            range_chr_as_int = None
            if 'loc_chr' in start_vars:
                range_chr_as_int = chr_order_by_name.get(self.location_chr)

            use_peak_location = self.location_type == "highest_lod"
            # Attribute that holds the Mb position for the active mode; it is
            # only read when a location bound is actually set.
            mb_attr = "locus_mb" if use_peak_location else "mb"

            for trait in list(self.correlation_data.keys())[:self.return_number]:
                trait_object = create_trait(dataset=self.target_dataset,
                                            name=trait,
                                            get_qtl_info=True,
                                            get_sample_info=False)
                if not trait_object:
                    continue

                if use_peak_location:
                    trait_chr = trait_object.locus_chr
                else:
                    trait_chr = trait_object.chr
                # 0 means "no matching chromosome".
                chr_as_int = chr_order_by_name.get(trait_chr, 0)

                corr_value = float(self.correlation_data[trait][0])
                if not (self.p_range_lower <= corr_value <= self.p_range_upper):
                    continue

                if (self.target_dataset.type in ("ProbeSet", "Publish")
                        and bool(trait_object.mean)):
                    if (self.min_expr is not None
                            and float(trait_object.mean) < self.min_expr):
                        continue

                if range_chr_as_int is not None and chr_as_int != range_chr_as_int:
                    continue
                if (self.min_location_mb is not None
                        and float(getattr(trait_object, mb_attr)) < float(self.min_location_mb)):
                    continue
                if (self.max_location_mb is not None
                        and float(getattr(trait_object, mb_attr)) > float(self.max_location_mb)):
                    continue

                (trait_object.sample_r,
                 trait_object.sample_p,
                 trait_object.num_overlap) = self.correlation_data[trait]

                # Set some sane defaults
                trait_object.tissue_corr = 0
                trait_object.tissue_pvalue = 0
                trait_object.lit_corr = 0
                if self.corr_type == "tissue" and tissue_corr_data is not None:
                    trait_object.tissue_corr = tissue_corr_data[trait][1]
                    trait_object.tissue_pvalue = tissue_corr_data[trait][2]
                elif self.corr_type == "lit":
                    trait_object.lit_corr = lit_corr_data[trait][1]

                self.correlation_results.append(trait_object)

            if self.corr_type != "lit" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
                self.do_lit_correlation_for_trait_list()

            if self.corr_type != "tissue" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
                self.do_tissue_correlation_for_trait_list()

        self.json_results = generate_corr_json(self.correlation_results, self.this_trait, self.dataset, self.target_dataset)
def _format_lod_score(trait_metadata):
    """Return the trait's LRS score divided by 4.61 as a one-decimal string.

    Falls back to "N/A" when ``lrs_score`` is absent or cannot be converted
    to a float (for example ``None`` or an empty string).
    """
    try:
        return f"{float(trait_metadata['lrs_score']) / 4.61:.1f}"
    except (KeyError, TypeError, ValueError):
        return "N/A"


def _three_decimals_or_na(value):
    """Format a truthy numeric value with three decimals, else "N/A"."""
    return f"{float(value):.3f}" if value else "N/A"


def correlation_json_for_table(correlation_data, this_trait, this_dataset,
                               target_dataset_ob):
    """Return JSON data for use with the DataTable in the correlation result page

    Keyword arguments:
    correlation_data -- dict providing 'this_dataset' (dataset of the trait
                        being correlated, as a dict) and
                        'correlation_results' (list of single-key dicts
                        mapping a trait name to its correlation values)
    this_trait -- unused; kept so existing callers keep working
    this_dataset -- ignored; the value from correlation_data is used instead
    target_dataset_ob -- Target dataset, as a Dataset ob

    Metadata of target traits is cached in a per-dataset JSON file under
    TMPDIR.  Traits missing from the cache are created, and the cache file
    is rewritten when at least one new trait was added.

    Returns a JSON string: a list with one dict per trait whose cached
    metadata does not have 'view' set to False.
    """
    this_dataset = correlation_data['this_dataset']
    target_dataset = target_dataset_ob.as_dict()
    corr_results = correlation_data['correlation_results']

    file_path = os.path.join(TMPDIR,
                             f"{target_dataset['name']}_metadata.json")
    new_traits_metadata = {}

    try:
        with open(file_path, "r") as file_handler:
            dataset_metadata = json.load(file_handler)
    except FileNotFoundError:
        Path(file_path).touch(exist_ok=True)
        dataset_metadata = {}
    except json.JSONDecodeError:
        # The touch() above leaves an empty file behind when a run adds no
        # new metadata; treat an empty or corrupt cache as "nothing cached"
        # instead of failing the whole request.
        dataset_metadata = {}

    results_list = []
    for i, trait_dict in enumerate(corr_results):
        trait_name = list(trait_dict)[0]
        trait = trait_dict[trait_name]

        target_trait = dataset_metadata.get(trait_name)
        if target_trait is None:
            target_trait_ob = create_trait(dataset=target_dataset_ob,
                                           name=trait_name,
                                           get_qtl_info=True)
            target_trait = jsonable(target_trait_ob, target_dataset_ob)
            new_traits_metadata[trait_name] = target_trait
        # Only an explicit False hides the trait; 'index' below keeps the
        # position in corr_results, so hidden traits leave gaps.
        if target_trait['view'] == False:  # noqa: E712
            continue

        results_dict = {
            'index': i + 1,
            'trait_id': target_trait['name'],
            'dataset': target_dataset['name'],
            'hmac': hmac.data_hmac('{}:{}'.format(
                target_trait['name'], target_dataset['name'])),
            'sample_r': f"{float(trait['corr_coefficient']):.3f}",
            'num_overlap': trait['num_overlap'],
            'sample_p': f"{float(trait['p_value']):.3e}",
        }

        if target_dataset['type'] == "ProbeSet":
            results_dict['symbol'] = target_trait['symbol']
            results_dict['description'] = (target_trait['description']
                                           or "N/A")
            results_dict['location'] = target_trait['location']
            results_dict['mean'] = _three_decimals_or_na(
                target_trait['mean'])
            results_dict['additive'] = _three_decimals_or_na(
                target_trait['additive'])
            results_dict['lod_score'] = _format_lod_score(target_trait)
            results_dict['lrs_location'] = target_trait['lrs_location']
            results_dict['lit_corr'] = "--"
            results_dict['tissue_corr'] = "--"
            results_dict['tissue_pvalue'] = "--"
            if this_dataset['type'] == "ProbeSet":
                if 'lit_corr' in trait:
                    results_dict[
                        'lit_corr'] = f"{float(trait['lit_corr']):.3f}"
                if 'tissue_corr' in trait:
                    results_dict[
                        'tissue_corr'] = f"{float(trait['tissue_corr']):.3f}"
                    results_dict[
                        'tissue_pvalue'] = f"{float(trait['tissue_p_val']):.3e}"
        elif target_dataset['type'] == "Publish":
            results_dict['abbreviation_display'] = (
                target_trait['abbreviation'] or "N/A")
            results_dict['description'] = (target_trait['description']
                                           or "N/A")
            results_dict['mean'] = _three_decimals_or_na(
                target_trait['mean'])

            authors = target_trait['authors']
            authors_display = "N/A"
            if authors:
                authors_list = [name.strip() for name in authors.split(',')]
                if len(authors_list) > 6:
                    # Long author lists are cut after the sixth name.
                    authors_display = ", ".join(
                        authors_list[:6]) + ", et al."
                else:
                    authors_display = authors
            results_dict['authors_display'] = authors_display

            results_dict['additive'] = _three_decimals_or_na(
                target_trait['additive'])
            if 'pubmed_id' in target_trait:
                results_dict['pubmed_link'] = target_trait['pubmed_link']
                results_dict['pubmed_text'] = target_trait['pubmed_text']
            else:
                results_dict['pubmed_link'] = "N/A"
                results_dict['pubmed_text'] = "N/A"
            results_dict['lod_score'] = _format_lod_score(target_trait)
            results_dict['lrs_location'] = target_trait['lrs_location']
        else:
            results_dict['location'] = target_trait['location']

        results_list.append(results_dict)

    if new_traits_metadata:
        # New traits were looked up during this call: persist them so the
        # next request for this dataset can skip the trait creation.
        dataset_metadata.update(new_traits_metadata)
        with open(file_path, "w") as file_handler:
            json.dump(dataset_metadata, file_handler)
    return json.dumps(results_list)
    def __init__(self, kw):
        """Run a global search and store the matching traits on the instance.

        ``kw`` must provide:
          'type'  -- "gene" or "phenotype"; any other value leaves the
                     result attributes unset
          'terms' -- the search string

        For a recognised type the following attributes are set:
          self.trait_list    -- JSON string with one dict per matching trait
          self.trait_count   -- number of traits in that list
          self.header_fields -- column titles for the result table

        Gene results from datasets whose permissions lack "view" are left
        out; phenotype results for which create_trait returns a falsy value
        are left out.
        """
        assert('type' in kw)
        assert('terms' in kw)

        self.type = kw['type']
        self.terms = kw['terms']
        assert(is_str(self.type))

        if self.type == "gene":
            # SECURITY NOTE(review): the search terms are interpolated
            # straight into the SQL text, so a crafted term can alter the
            # statement. This should move to bound parameters once the
            # parameter style accepted by g.db.execute is confirmed.
            sql = """
                SELECT
                Species.`Name` AS species_name,
                InbredSet.`Name` AS inbredset_name,
                Tissue.`Name` AS tissue_name,
                ProbeSetFreeze.Name AS probesetfreeze_name,
                ProbeSetFreeze.FullName AS probesetfreeze_fullname,
                ProbeSet.Name AS probeset_name,
                ProbeSet.Symbol AS probeset_symbol,
                CAST(ProbeSet.`description` AS BINARY) AS probeset_description,
                ProbeSet.Chr AS chr,
                ProbeSet.Mb AS mb,
                ProbeSetXRef.Mean AS mean,
                ProbeSetXRef.LRS AS lrs,
                ProbeSetXRef.`Locus` AS locus,
                ProbeSetXRef.`pValue` AS pvalue,
                ProbeSetXRef.`additive` AS additive,
                ProbeSetFreeze.Id AS probesetfreeze_id,
                Geno.Chr as geno_chr,
                Geno.Mb as geno_mb
                FROM Species
                INNER JOIN InbredSet ON InbredSet.`SpeciesId`=Species.`Id`
                INNER JOIN ProbeFreeze ON ProbeFreeze.InbredSetId=InbredSet.`Id`
                INNER JOIN Tissue ON ProbeFreeze.`TissueId`=Tissue.`Id`
                INNER JOIN ProbeSetFreeze ON ProbeSetFreeze.ProbeFreezeId=ProbeFreeze.Id
                INNER JOIN ProbeSetXRef ON ProbeSetXRef.ProbeSetFreezeId=ProbeSetFreeze.Id
                INNER JOIN ProbeSet ON ProbeSet.Id = ProbeSetXRef.ProbeSetId
                LEFT JOIN Geno ON ProbeSetXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id
                WHERE ( MATCH (ProbeSet.Name,ProbeSet.description,ProbeSet.symbol,ProbeSet.alias,ProbeSet.GenbankId, ProbeSet.UniGeneId, ProbeSet.Probe_Target_Description) AGAINST ('%s' IN BOOLEAN MODE) )
                AND ProbeSetFreeze.confidentiality < 1
                AND ProbeSetFreeze.public > 0
                ORDER BY species_name, inbredset_name, tissue_name, probesetfreeze_name, probeset_name
                LIMIT 6000
                """ % (self.terms)
            with Bench("Running query"):
                logger.sql(sql)
                rows = g.db.execute(sql).fetchall()

            trait_list = []
            # Permissions are looked up once per dataset id and reused for
            # every further row of the same dataset.
            dataset_to_permissions = {}
            with Bench("Creating trait objects"):
                for i, line in enumerate(rows):
                    this_trait = {
                        'index': i + 1,
                        'name': line[5],
                        'dataset': line[3],
                        'dataset_fullname': line[4],
                        'hmac': hmac.data_hmac(
                            '{}:{}'.format(line[5], line[3])),
                        'species': line[0],
                        'group': line[1],
                        'tissue': line[2],
                        'symbol': line[6],
                    }
                    if line[7]:
                        this_trait['description'] = line[7].decode(
                            'utf-8', 'replace')
                    else:
                        this_trait['description'] = "N/A"

                    # A NULL chromosome or position is reported as "N/A"
                    # rather than formatted (float(None) would raise).
                    this_trait['location_repr'] = 'N/A'
                    if (line[8] not in (None, "NULL", "")
                            and line[9] not in (None, 0)):
                        this_trait['location_repr'] = 'Chr%s: %.6f' % (
                            line[8], float(line[9]))

                    try:
                        this_trait['mean'] = '%.3f' % line[10]
                    except TypeError:
                        # NULL or non-numeric mean
                        this_trait['mean'] = "N/A"

                    this_trait['LRS_score_repr'] = "N/A"
                    if line[11] not in ("", None):
                        this_trait['LRS_score_repr'] = '%3.1f' % line[11]
                    this_trait['additive'] = "N/A"
                    if line[14] not in ("", None):
                        this_trait['additive'] = '%.3f' % line[14]
                    this_trait['dataset_id'] = line[15]
                    this_trait['locus_chr'] = line[16]
                    this_trait['locus_mb'] = line[17]

                    dataset_id = this_trait['dataset_id']
                    if dataset_id not in dataset_to_permissions:
                        dataset_ob = SimpleNamespace(
                            id=dataset_id,
                            type="ProbeSet",
                            species=this_trait['species'])
                        dataset_to_permissions[dataset_id] = (
                            check_resource_availability(dataset_ob))
                    # Always read back from the cache so a cache hit uses
                    # this dataset's permissions, not the previous row's.
                    permissions = dataset_to_permissions[dataset_id]
                    if "view" not in permissions['data']:
                        continue

                    max_lrs_text = "N/A"
                    if (this_trait['locus_chr'] is not None
                            and this_trait['locus_mb'] is not None):
                        max_lrs_text = ("Chr" + str(this_trait['locus_chr'])
                                        + ": " + str(this_trait['locus_mb']))
                    this_trait['max_lrs_text'] = max_lrs_text

                    trait_list.append(this_trait)

            self.trait_count = len(trait_list)
            self.trait_list = json.dumps(trait_list)

            self.header_fields = ['Index',
                                  'Record',
                                  'Species',
                                  'Group',
                                  'Tissue',
                                  'Dataset',
                                  'Symbol',
                                  'Description',
                                  'Location',
                                  'Mean',
                                  'Max LRS',
                                  'Max LRS Location',
                                  'Additive Effect']

        elif self.type == "phenotype":
            search_term = self.terms
            group_clause = ""
            if "_" in self.terms:
                # "<3-letter code>_<term>" restricts the search to the
                # group with that InbredSetCode.
                prefix, remainder = self.terms.split("_")[:2]
                if len(prefix) == 3:
                    search_term = remainder
                    group_clause = "AND InbredSet.`InbredSetCode` = '{}'".format(prefix)
            # SECURITY NOTE(review): group code and search term are
            # formatted straight into the SQL text (see the gene query
            # above); this should move to bound parameters once the
            # g.db.execute parameter style is confirmed.
            sql = """
                SELECT
                Species.`Name`,
                InbredSet.`Name`,
                PublishFreeze.`Name`,
                PublishFreeze.`FullName`,
                PublishXRef.`Id`,
                CAST(Phenotype.`Pre_publication_description` AS BINARY),
                CAST(Phenotype.`Post_publication_description` AS BINARY),
                Publication.`Authors`,
                Publication.`Year`,
                Publication.`PubMed_ID`,
                PublishXRef.`LRS`,
                PublishXRef.`additive`,
                InbredSet.`InbredSetCode`,
                PublishXRef.`mean`
                FROM Species,InbredSet,PublishFreeze,PublishXRef,Phenotype,Publication
                WHERE PublishXRef.`InbredSetId`=InbredSet.`Id`
                AND PublishFreeze.`InbredSetId`=InbredSet.`Id`
                AND InbredSet.`SpeciesId`=Species.`Id`
                {0}
                AND PublishXRef.`PhenotypeId`=Phenotype.`Id`
                AND PublishXRef.`PublicationId`=Publication.`Id`
                AND   (Phenotype.Post_publication_description REGEXP "[[:<:]]{1}[[:>:]]"
                    OR Phenotype.Pre_publication_description REGEXP "[[:<:]]{1}[[:>:]]"
                    OR Phenotype.Pre_publication_abbreviation REGEXP "[[:<:]]{1}[[:>:]]"
                    OR Phenotype.Post_publication_abbreviation REGEXP "[[:<:]]{1}[[:>:]]"
                    OR Phenotype.Lab_code REGEXP "[[:<:]]{1}[[:>:]]"
                    OR Publication.PubMed_ID REGEXP "[[:<:]]{1}[[:>:]]"
                    OR Publication.Abstract REGEXP "[[:<:]]{1}[[:>:]]"
                    OR Publication.Title REGEXP "[[:<:]]{1}[[:>:]]"
                    OR Publication.Authors REGEXP "[[:<:]]{1}[[:>:]]"
                    OR PublishXRef.Id REGEXP "[[:<:]]{1}[[:>:]]")
                ORDER BY Species.`Name`, InbredSet.`Name`, PublishXRef.`Id`
                LIMIT 6000
                """.format(group_clause, search_term)
            logger.sql(sql)
            rows = g.db.execute(sql).fetchall()
            trait_list = []
            with Bench("Creating trait objects"):
                for i, line in enumerate(rows):
                    this_trait = {}
                    this_trait['index'] = i + 1
                    this_trait['name'] = str(line[4])
                    if len(str(line[12])) == 3:
                        this_trait['display_name'] = (
                            str(line[12]) + "_" + this_trait['name'])
                    else:
                        this_trait['display_name'] = this_trait['name']
                    this_trait['dataset'] = line[2]
                    this_trait['dataset_fullname'] = line[3]
                    this_trait['hmac'] = hmac.data_hmac(
                        '{}:{}'.format(line[4], line[2]))
                    this_trait['species'] = line[0]
                    this_trait['group'] = line[1]

                    # With a PubMed ID the post-publication description is
                    # preferred; otherwise the pre-publication one is used.
                    if line[9] is not None and line[6] is not None:
                        this_trait['description'] = line[6].decode(
                            'utf-8', 'replace')
                    elif line[5] is not None:
                        this_trait['description'] = line[5].decode(
                            'utf-8', 'replace')
                    else:
                        this_trait['description'] = "N/A"

                    if line[13] is not None and line[13] != "":
                        this_trait['mean'] = line[13]
                    else:
                        this_trait['mean'] = "N/A"
                    this_trait['authors'] = line[7]
                    this_trait['year'] = line[8]
                    # Year may be NULL; only an all-digit year is shown.
                    year = this_trait['year']
                    if year is not None and str(year).isdigit():
                        this_trait['pubmed_text'] = year
                    else:
                        this_trait['pubmed_text'] = "N/A"

                    if line[9] != "" and line[9] is not None:
                        # The link is built from the PubMed ID (column 9),
                        # not from the publication year (column 8).
                        this_trait['pubmed_link'] = (
                            webqtlConfig.PUBMEDLINK_URL % line[9])
                    else:
                        this_trait['pubmed_link'] = "N/A"
                        if line[12]:
                            this_trait['display_name'] = (
                                line[12] + "_" + str(this_trait['name']))

                    this_trait['LRS_score_repr'] = "N/A"
                    if line[10] not in ("", None):
                        this_trait['LRS_score_repr'] = '%3.1f' % line[10]
                    this_trait['additive'] = "N/A"
                    if line[11] not in ("", None):
                        this_trait['additive'] = '%.3f' % line[11]

                    this_trait['max_lrs_text'] = "N/A"
                    trait_ob = create_trait(dataset_name=this_trait['dataset'],
                                            name=this_trait['name'],
                                            get_qtl_info=True,
                                            get_sample_info=False)
                    if not trait_ob:
                        continue
                    if this_trait['dataset'] == this_trait['group'] + "Publish":
                        try:
                            if (trait_ob.locus_chr != ""
                                    and trait_ob.locus_mb != ""):
                                this_trait['max_lrs_text'] = (
                                    "Chr" + str(trait_ob.locus_chr)
                                    + ": " + str(trait_ob.locus_mb))
                        except AttributeError:
                            # Trait object carries no locus information.
                            this_trait['max_lrs_text'] = "N/A"

                    trait_list.append(this_trait)

            self.trait_count = len(trait_list)
            self.trait_list = json.dumps(trait_list)

            self.header_fields = ['Index',
                                  'Species',
                                  'Group',
                                  'Record',
                                  'Description',
                                  'Authors',
                                  'Year',
                                  'Max LRS',
                                  'Max LRS Location',
                                  'Additive Effect']
    def gen_search_result(self):
        """
        Get the info displayed in the search result table from the set of results computed in
        the "search" function

        Reads self.results (falsy entries are skipped; the first item of each
        entry is used as the trait name) and self.dataset.  Sets
        self.trait_list to a list with one dict per trait that create_trait
        returned, and, for ProbeSet, Publish and Geno datasets, sets
        self.header_data_names to the column keys of the table.
        """

        def three_decimals_or_na(value):
            # Missing values ("" or None) are shown as "N/A".
            if value in ("", None):
                return "N/A"
            return f"{value:.3f}"

        def lod_score_or_na(trait):
            # LRS is shown as LOD (LRS / 4.61); anything non-numeric or
            # absent becomes "N/A".
            try:
                return f"{float(trait.LRS_score_repr) / 4.61:.1f}"
            except (AttributeError, TypeError, ValueError):
                return "N/A"

        trait_list = []
        # NOTE(review): json_trait_list and species are not used in the
        # visible part of this method; kept in case later code or the
        # species lookup itself is relied upon.
        json_trait_list = []

        species = webqtlDatabaseFunction.retrieve_species(
            self.dataset.group.name)
        # result_set represents the results for each search term; a search of
        # "shh grin2b" would have two sets of results, one for each term
        logger.debug("self.results is:", pf(self.results))

        for index, result in enumerate(self.results):
            if not result:
                continue

            #### Excel file needs to be generated ####

            trait_id = result[0]
            this_trait = create_trait(dataset=self.dataset,
                                      name=trait_id,
                                      get_qtl_info=True,
                                      get_sample_info=False)
            if not this_trait:
                continue

            dataset_type = this_trait.dataset.type
            trait_dict = {}
            trait_dict['index'] = index + 1
            trait_dict['name'] = this_trait.name
            if dataset_type == "Publish":
                trait_dict['display_name'] = this_trait.display_name
            else:
                trait_dict['display_name'] = this_trait.name
            trait_dict['dataset'] = this_trait.dataset.name
            trait_dict['hmac'] = hmac.data_hmac('{}:{}'.format(
                this_trait.name, this_trait.dataset.name))

            if dataset_type == "ProbeSet":
                trait_dict['symbol'] = this_trait.symbol
                trait_dict['description'] = (this_trait.description_display
                                             or "N/A")
                trait_dict['location'] = this_trait.location_repr
                trait_dict['mean'] = three_decimals_or_na(this_trait.mean)
                # additive gets the same ""/None guard as mean; a None
                # additive used to crash the format call.
                trait_dict['additive'] = three_decimals_or_na(
                    this_trait.additive)
                trait_dict['lod_score'] = lod_score_or_na(this_trait)
                trait_dict['lrs_location'] = this_trait.LRS_location_repr
            elif dataset_type == "Geno":
                trait_dict['location'] = this_trait.location_repr
            elif dataset_type == "Publish":
                trait_dict['description'] = (this_trait.description_display
                                             or "N/A")
                trait_dict['authors'] = this_trait.authors
                trait_dict['pubmed_id'] = "N/A"
                if this_trait.pubmed_id:
                    trait_dict['pubmed_id'] = this_trait.pubmed_id
                    trait_dict['pubmed_link'] = this_trait.pubmed_link
                trait_dict['pubmed_text'] = this_trait.pubmed_text
                trait_dict['mean'] = three_decimals_or_na(this_trait.mean)
                trait_dict['lod_score'] = lod_score_or_na(this_trait)
                trait_dict['lrs_location'] = this_trait.LRS_location_repr
                trait_dict['additive'] = three_decimals_or_na(
                    this_trait.additive)

            # Convert any bytes in dict to a normal utf-8 string
            for key, value in trait_dict.items():
                if isinstance(value, bytes):
                    trait_dict[key] = value.decode('utf-8')
            trait_list.append(trait_dict)

        self.trait_list = trait_list

        # NOTE(review): the rows above carry 'lod_score' while the column
        # lists below name 'lrs_score' -- confirm against the consumer of
        # header_data_names before changing either side.
        if self.dataset.type == "ProbeSet":
            self.header_data_names = [
                'index', 'display_name', 'symbol', 'description', 'location',
                'mean', 'lrs_score', 'lrs_location', 'additive'
            ]
        elif self.dataset.type == "Publish":
            self.header_data_names = [
                'index', 'display_name', 'description', 'mean', 'authors',
                'pubmed_text', 'lrs_score', 'lrs_location', 'additive'
            ]
        elif self.dataset.type == "Geno":
            self.header_data_names = ['index', 'display_name', 'location']