def load_genos(file):
    """Parse a genotype file into strain names and per-marker records.

    Blank lines and lines starting with ``#`` or ``@`` are ignored. The line
    starting with ``Chr`` is the header: its columns from the fifth onward
    are the strain names, which are lower-cased. Every other line is a marker
    row whose first four columns are chromosome, locus, cM and Mb, followed
    by one genotype value per strain.

    Args:
        file: path of the genotype file to read.

    Returns:
        A ``(strains, genotypes)`` tuple. ``strains`` is the list of
        lower-cased strain names (empty when the file has no header line).
        ``genotypes`` is a list of dicts with the keys ``chr``, ``locus``,
        ``cm``, ``mb``, ``values`` (each cell passed through ``to_number``)
        and ``dicvalues`` (the result of ``utilities.to_dic(strains, values)``
        -- presumably a strain -> value mapping; confirm against utilities).

    Raises:
        ValueError: if a marker row appears before the ``Chr`` header line,
            because the values could not be paired with strain names.
    """
    strains = []
    header_seen = False
    genotypes = []
    # The context manager closes the file even if a row fails to parse.
    with open(file, 'r') as file_geno:
        for line in file_geno:
            line = line.strip()
            # Blank lines carry no cells; comment/metadata lines are skipped.
            if not line or line.startswith(('#', '@')):
                continue
            cells = line.split()
            if line.startswith("Chr"):
                strains = [strain.lower() for strain in cells[4:]]
                header_seen = True
                continue
            if not header_seen:
                raise ValueError(
                    "genotype row found before the 'Chr' header line in %s" % file)
            values = [to_number(value) for value in cells[4:]]
            genotypes.append({
                'chr': cells[0],
                'locus': cells[1],
                'cm': cells[2],
                'mb': cells[3],
                'values': values,
                'dicvalues': utilities.to_dic(strains, values),
            })
    return strains, genotypes
def probesetfreeze_item(strains, dir, probesetfreeze):
    """Dump one ProbeSetFreeze to a quoted, comma-separated text file.

    The file is written to
    ``<dir>/ProbeSetFreezeId_<id>_FullName_<name>.txt``. The header row is
    ``"ID"`` followed by ``strain[1]`` of every strain. Each data row is the
    probe-set name followed by that probe set's value for every strain, or an
    empty quoted field when the strain has no value. Probe sets whose data
    lookup returns no rows are skipped. The number of probe-set
    cross-references is printed.

    Args:
        strains: sequence of strain records; index 1 is used as the name.
        dir: directory that receives the output file.
        probesetfreeze: record whose index 0 is the numeric id and index 1
            the name used in the file name.
    """
    probesetfreezeid = probesetfreeze[0]
    probesetfreezename = probesetfreeze[1]
    # NOTE(review): the file name is labelled "FullName" but is built from
    # probesetfreeze[1], not probesetfreeze[2] -- confirm which is intended.
    path = "%s/ProbeSetFreezeId_%d_FullName_%s.txt" % (
        dir, probesetfreezeid, probesetfreezename)
    strainnames = [strain[1] for strain in strains]

    # The context manager closes the file even if a lookup raises.
    with open(path, "w+") as outputfile:
        outputfile.write('"ID",')
        outputfile.write(','.join(['"%s"' % name for name in strainnames]))
        outputfile.write("\n")
        outputfile.flush()

        probesetxrefs = probesets.get_probesetxref(probesetfreezeid)
        print(len(probesetxrefs))
        for probesetxref in probesetxrefs:
            probesetid = probesetxref[0]
            probesetdataid = probesetxref[1]
            probesetname = probesets.get_probeset(probesetid)[1]
            rows = probesets.get_probesetdata(probesetdataid)
            if len(rows) == 0:
                continue
            # list() keeps the transposed columns indexable where zip()
            # returns an iterator (Python 3).
            columns = list(zip(*rows))
            probesetdata = utilities.to_dic(columns[1], columns[2])

            fields = ['"%s"' % probesetname]
            for name in strainnames:
                value = probesetdata[name] if name in probesetdata else ''
                fields.append(',"%s"' % value)
            outputfile.write(''.join(fields))
            outputfile.write('\n')
            outputfile.flush()
def output_pheno(inbredsetid, file):
    """Write a tab-separated phenotype-by-strain table for one inbred set.

    The header row is ``PhenotypeID``, ``PhenotypeName`` and then
    ``strain[1]`` of every strain returned by
    ``datastructure.get_strains``. Each following row holds the publish
    cross-reference id, the phenotype name (fields 0-2 of the phenotype
    record joined with ``;``, whitespace runs collapsed to one space and
    stripped) and one value per strain, with ``x`` where the strain has no
    value. Cross-references with no publish data are skipped. Every cell,
    including the last of a row, is followed by a tab. Progress counts are
    printed.

    Args:
        inbredsetid: id handed to the strain and cross-reference lookups.
        file: path of the output file; it is created or truncated.
    """
    # The context manager closes the file even if a lookup raises.
    with open(file, 'w') as outputfile:
        strains = datastructure.get_strains(inbredsetid)
        print("get %d strains" % (len(strains)))
        print("strains: %s" % str(strains))

        publishxrefs = phenotypes.get_publishxrefs(inbredsetid)
        print("get %d publishxrefs" % (len(publishxrefs)))

        header = ["PhenotypeID", "PhenotypeName"]
        header.extend(strain[1] for strain in strains)
        outputfile.write("".join(["%s\t" % cell for cell in header]))
        outputfile.write("\n")
        outputfile.flush()

        # Lookup keys are lower-cased to match the lower-cased data keys.
        strainnames = [strain[1].lower() for strain in strains]

        for publishxref in publishxrefs:
            publishxrefid = publishxref[0]
            phenotype = phenotypes.get_phenotype(publishxref[1])
            # NOTE(review): the publication record is fetched but never used;
            # the call is kept in case the lookup has side effects -- confirm
            # and drop it if it is a plain read.
            phenotypes.get_publication(publishxref[2])
            # list() keeps the transposed columns sized and indexable where
            # zip() returns an iterator (Python 3).
            columns = list(zip(*phenotypes.get_publishdata(publishxref[3])))
            if not columns:
                continue
            publishdata = utilities.to_dic(
                [name.lower() for name in columns[1]], columns[2])

            phenotypename = "%s;%s;%s" % (phenotype[0], phenotype[1], phenotype[2])
            phenotypename = re.sub(r'\s+', ' ', phenotypename).strip()

            cells = [publishxrefid, phenotypename]
            for name in strainnames:
                cells.append(publishdata[name] if name in publishdata else 'x')
            outputfile.write("".join(["%s\t" % cell for cell in cells]))
            outputfile.write("\n")
            outputfile.flush()
def correlations(outputdir, genos, probesetfreeze):
    """Write probe-set versus genotype correlations for one ProbeSetFreeze.

    The output file is ``<outputdir>/<id>_<name>.txt``, tab-separated with a
    trailing tab on every row. For every probe set of the freeze and every
    genotype record, one row is written with the probe-set id, probe-set
    name, genotype locus, number of overlapping keys and the four numbers
    ``rs[0][0]``, ``rs[0][1]``, ``rs[1][0]``, ``rs[1][1]`` returned by
    ``calculate.correlation`` (written under the Pearson r/p and Spearman
    r/p headers). Probe sets with no data rows are skipped.

    Args:
        outputdir: directory that receives the output file.
        genos: iterable of genotype dicts providing ``locus`` and
            ``dicvalues``.
        probesetfreeze: record whose index 0 is the numeric id and index 1
            the name used in the file name.
    """
    print(probesetfreeze)
    probesetfreezeid = probesetfreeze[0]
    probesetfreezename = probesetfreeze[1]
    path = "%s/%d_%s.txt" % (outputdir, probesetfreezeid, probesetfreezename)

    # The context manager closes the file even if a lookup raises.
    with open(path, "w+") as outputfile:
        header = [
            "ProbeSet Id", "ProbeSet Name", "Geno Name", "Overlap Number",
            "Pearson r", "Pearson p", "Spearman r", "Spearman p",
        ]
        outputfile.write("".join(["%s\t" % cell for cell in header]))
        outputfile.write("\n")
        outputfile.flush()

        probesetxrefs = probesets.get_probesetxref(probesetfreezeid)
        print("Get %d probesetxrefs" % (len(probesetxrefs)))

        for probesetxref in probesetxrefs:
            probesetid = probesetxref[0]
            probesetname = probesets.get_probeset(probesetid)[1]
            # list() keeps the transposed columns indexable where zip()
            # returns an iterator (Python 3).
            columns = list(zip(*probesets.get_probesetdata(probesetxref[1])))
            if not columns:
                # No data rows: nothing to correlate against.
                continue
            probesetdata = utilities.to_dic(
                [name.lower() for name in columns[1]], columns[2])

            for geno in genos:
                keys, values1, values2 = utilities.overlap(
                    geno['dicvalues'], probesetdata)
                rs = calculate.correlation(values1, values2)
                cells = [
                    probesetid, probesetname, geno['locus'], len(keys),
                    rs[0][0], rs[0][1], rs[1][0], rs[1][1],
                ]
                outputfile.write("".join(["%s\t" % cell for cell in cells]))
                outputfile.write("\n")
            # One flush per probe set instead of per row.
            outputfile.flush()
def generate_probesets(probesetfreezesfile, outputdir):
    """Write one probe-set-by-strain table per ProbeSetFreeze listed in a file.

    Each non-blank line of ``probesetfreezesfile`` starts with a
    ProbeSetFreeze id. For every id the freeze record, its inbred set and the
    strains of that set are looked up, and a tab-separated file
    ``<outputdir>/<id>_<name>.txt`` is written. The header row is
    ``ProbeSet Id``, ``ProbeSet Name`` and the upper-cased strain names; each
    data row is the probe-set id, name and one value per strain, with ``x``
    where the strain has no value. Probe sets with no data rows are skipped.
    The freeze record and its cross-reference count are printed.

    Args:
        probesetfreezesfile: path of the text file listing the ids.
        outputdir: directory that receives the output files.
    """
    # Both files are closed by their context managers even on error.
    with open(probesetfreezesfile, 'r') as idfile:
        for line in idfile:
            cells = line.split()
            if not cells:
                # Blank line: there is no id to look up.
                continue
            probesetfreeze = datastructure.get_probesetfreeze(cells[0])
            probesetfreezeid = probesetfreeze[0]
            probesetfreezename = probesetfreeze[1]
            inbredset = datastructure.get_inbredset(probesetfreezeid)
            strains = datastructure.get_strains(inbredset[0])
            # Lookup keys are lower-cased to match the lower-cased data keys.
            strainnames = [strain[1].lower() for strain in strains]

            path = "%s/%d_%s.txt" % (outputdir, probesetfreezeid, probesetfreezename)
            with open(path, "w+") as outputfile:
                outputfile.write("ProbeSet Id\t")
                outputfile.write("ProbeSet Name\t")
                outputfile.write('\t'.join([strain[1].upper() for strain in strains]))
                outputfile.write("\n")
                outputfile.flush()

                probesetxrefs = probesets.get_probesetxref(probesetfreezeid)
                print(probesetfreeze)
                print(len(probesetxrefs))
                for probesetxref in probesetxrefs:
                    probesetid = probesetxref[0]
                    probesetname = probesets.get_probeset(probesetid)[1]
                    # list() keeps the transposed columns indexable where
                    # zip() returns an iterator (Python 3).
                    columns = list(zip(*probesets.get_probesetdata(probesetxref[1])))
                    if not columns:
                        continue
                    probesetdata = utilities.to_dic(
                        [name.lower() for name in columns[1]], columns[2])

                    row = [probesetid, probesetname]
                    for name in strainnames:
                        row.append(probesetdata[name] if name in probesetdata else 'x')
                    outputfile.write("".join(["%s\t" % cell for cell in row]))
                    outputfile.write("\n")
                    outputfile.flush()
def _strain_value_dic(rows):
    """Map lower-cased strain (column 1) to value (column 2); {} for no rows."""
    # list() keeps the transposed columns sized and indexable where zip()
    # returns an iterator (Python 3).
    columns = list(zip(*rows))
    if not columns:
        return {}
    return utilities.to_dic([strain.lower() for strain in columns[1]], columns[2])


def bxd_pheno(file, inbredsetid=1):
    """Write a tab-separated table of phenotype value, N and SE per strain.

    The header row is ``PhenotypeID``, ``PhenotypeName`` and, for every
    strain, the three columns ``<strain>-expression``, ``<strain>-N`` and
    ``<strain>-SE``. Each following row holds the publish cross-reference id,
    the phenotype name (fields 0-2 of the phenotype record joined with ``;``,
    whitespace runs collapsed to one space) and, per strain, the values from
    ``get_publishdata``, ``get_publishdatan`` and ``get_publishdatase``, with
    ``x`` wherever a strain has no entry. Every cell is followed by a tab.
    Progress counts are printed.

    Args:
        file: path of the output file; it is created or truncated.
        inbredsetid: id handed to the strain and cross-reference lookups.
            Defaults to 1, the value that used to be hard-coded.
    """
    # The context manager closes the file even if a lookup raises.
    with open(file, 'w') as outputfile:
        strains = datastructure.get_strains(inbredsetid)
        print("get %d strains" % (len(strains)))

        publishxrefs = phenotypes.get_publishxrefs(inbredsetid)
        print("get %d publishxrefs" % (len(publishxrefs)))

        outputfile.write("PhenotypeID\t")
        outputfile.write("PhenotypeName\t")
        for strain in strains:
            strainname = strain[1]
            outputfile.write("%s-expression\t" % strainname)
            outputfile.write("%s-N\t" % strainname)
            outputfile.write("%s-SE\t" % strainname)
        outputfile.write("\n")
        outputfile.flush()

        # Lookup keys are lower-cased to match the lower-cased data keys.
        strainnames = [strain[1].lower() for strain in strains]

        for publishxref in publishxrefs:
            publishxrefid = publishxref[0]
            publishdataid = publishxref[3]

            phenotype = phenotypes.get_phenotype(publishxref[1])
            # NOTE(review): the publication record is fetched but never used;
            # the call is kept in case the lookup has side effects -- confirm
            # and drop it if it is a plain read.
            phenotypes.get_publication(publishxref[2])

            publishdata = _strain_value_dic(phenotypes.get_publishdata(publishdataid))
            publishdatan = _strain_value_dic(phenotypes.get_publishdatan(publishdataid))
            publishdatase = _strain_value_dic(phenotypes.get_publishdatase(publishdataid))

            phenotypename = "%s;%s;%s" % (phenotype[0], phenotype[1], phenotype[2])
            phenotypename = re.sub(r'\s+', ' ', phenotypename)

            cells = [publishxrefid, phenotypename]
            for name in strainnames:
                for lookup in (publishdata, publishdatan, publishdatase):
                    cells.append(lookup[name] if name in lookup else 'x')
            outputfile.write("".join(["%s\t" % cell for cell in cells]))
            outputfile.write("\n")
            outputfile.flush()
def bxd_geno_pheno_correlations(file, inbredsetid=1,
                                genofile="/home/leiyan/gn/web/genotypes/BXD.geno"):
    """Write phenotype versus genotype-marker correlations as a TSV file.

    Genotypes are loaded with ``genotypes.load_genos(genofile)``. For every
    publish cross-reference of the inbred set whose publish data transposes
    to exactly three columns, and for every genotype record, one row is
    written with the cross-reference id, the phenotype name (fields 0-2 of
    the phenotype record joined with ``;``), the marker's locus, chromosome,
    cM and Mb, the four numbers ``rs[0][0]``, ``rs[0][1]``, ``rs[1][0]``,
    ``rs[1][1]`` returned by ``calculate.correlation`` (written under the
    Pearson/Spearman correlation and p-value headers) and the number of
    overlapping keys. Cross-references with any other column count are
    reported on stdout and skipped. Every cell is followed by a tab.

    Args:
        file: path of the output file; it is created or truncated.
        inbredsetid: id handed to the cross-reference lookup. Defaults to 1,
            the value that used to be hard-coded.
        genofile: path of the genotype file. Defaults to the path that used
            to be hard-coded.
    """
    # The context manager closes the file even if a lookup raises.
    with open(file, 'w') as outputfile:
        genostrains, genos = genotypes.load_genos(genofile)
        print("From geno file, get %d strains" % (len(genostrains)))
        print("From geno file, get %d genos" % (len(genos)))

        publishxrefs = phenotypes.get_publishxrefs(inbredsetid)
        print("get %d publishxrefs" % (len(publishxrefs)))

        header = [
            "PhenotypeID", "PhenotypeName", "MarkerName", "MarkerChromosome",
            "MarkerCentimorgan", "MarkerMb", "PearsonCorrelation",
            "PearsonPvalue", "SpearmanCorrelation", "SpearmanPvalue",
            "Number_of_BXDs_used",
        ]
        outputfile.write("".join(["%s\t" % cell for cell in header]))
        outputfile.write("\n")
        outputfile.flush()

        for publishxref in publishxrefs:
            publishxrefid = publishxref[0]
            phenotype = phenotypes.get_phenotype(publishxref[1])
            # NOTE(review): the publication record is fetched but never used;
            # the call is kept in case the lookup has side effects -- confirm
            # and drop it if it is a plain read.
            phenotypes.get_publication(publishxref[2])
            # list() keeps the transposed columns sized and indexable where
            # zip() returns an iterator (Python 3).
            columns = list(zip(*phenotypes.get_publishdata(publishxref[3])))
            if len(columns) != 3:
                print("publishdata - %s: %d" % (publishxrefid, len(columns)))
                continue
            publishdata = utilities.to_dic(
                [name.lower() for name in columns[1]], columns[2])
            phenotypename = "%s;%s;%s" % (phenotype[0], phenotype[1], phenotype[2])

            for geno in genos:
                keys, values1, values2 = utilities.overlap(
                    geno['dicvalues'], publishdata)
                rs = calculate.correlation(values1, values2)
                cells = [
                    publishxrefid, phenotypename,
                    geno['locus'], geno['chr'], geno['cm'], geno['mb'],
                    rs[0][0], rs[0][1], rs[1][0], rs[1][1],
                    len(keys),
                ]
                outputfile.write("".join(["%s\t" % cell for cell in cells]))
                outputfile.write("\n")
            # One flush per phenotype instead of per row.
            outputfile.flush()