def annotateENcPrime(taxId, overwrite=False):
    """Make sure the 'ENc' and 'ENc-prime' properties exist for a species.

    The stored values are returned untouched when both are present and
    `overwrite` is false.  Otherwise both measures are recomputed with
    calculateENcPrimeForSpecies(), sanity-checked and stored through
    setSpeciesProperty().

    Returns:
        (taxId, ENc, ENc_prime, isNew) -- isNew is True when the values were
        computed by this call, False when previously stored values are
        returned.
    """
    storedENc = getSpeciesProperty(taxId, 'ENc')
    storedENcPrime = getSpeciesProperty(taxId, 'ENc-prime')

    bothStored = (storedENc[0] is not None) and (storedENcPrime[0] is not None)
    if bothStored and not overwrite:
        # Nothing to do: hand back the old values, flagged as not new.
        return (taxId, storedENc[0], storedENcPrime[0], False)

    ENc, ENc_prime = calculateENcPrimeForSpecies(taxId)

    # The true extreme values of ENc are not well established; these bounds
    # are only a sanity check on the computed numbers.
    assert 10.0 < ENc < 75.0
    assert 10.0 < ENc_prime < 75.0

    for propName, propValue in (('ENc', ENc), ('ENc-prime', ENc_prime)):
        setSpeciesProperty(taxId, propName, str(propValue), "ENCprime (custom version)")

    # Freshly computed values, flagged as new.
    return (taxId, ENc, ENc_prime, True)
def run():
    """Set the "algae" property for every species and print a summary.

    A species is classified, in order of priority, as:
      * 'No'  - its lineage intersects algaeDefinition_ExcludedGroups;
      * 'Yes' - it is listed in positiveGroup;
      * 'No'  - it is listed in negativeGroup.
    Species matching none of these are left unannotated; a warning is
    printed for them when their lineage intersects
    algaeDefinition_IncludedGroups.
    """
    positiveById = dict(positiveGroup)
    negativeById = dict(negativeGroup)

    numSpecies = 0
    numPositive = 0
    numNegative = 0

    for taxId in allSpeciesSource():
        numSpecies += 1

        lineage = frozenset(ncbiTaxa.get_lineage(taxId))

        # Decide on (answer, source-description), or None when undecided.
        if lineage.intersection(algaeDefinition_ExcludedGroups):
            verdict = ('No', 'Excluded taxonomic group')
        elif taxId in positiveById:
            verdict = ('Yes', positiveById[taxId])
        elif taxId in negativeById:
            verdict = ('No', negativeById[taxId])
        else:
            verdict = None

        if verdict is None:
            # Unannotated, but taxonomy suggests it may need a manual look.
            if lineage.intersection(algaeDefinition_IncludedGroups):
                print("Warning: check unannotated possible algae: %d" % taxId)
            continue

        answer, reason = verdict
        setSpeciesProperty(taxId, "algae", answer, reason, overwrite=True)

        # Update the counters and cross-check positives against the taxonomy.
        if answer == 'Yes':
            numPositive += 1
            if lineage.intersection(algaeDefinition_ExcludedGroups):
                print("Warning: possible false annotation: %d" % taxId)
            if not lineage.intersection(algaeDefinition_IncludedGroups):
                print("Warning: possible false annotation: %d" % taxId)
        elif answer == 'No':
            numNegative += 1
        else:
            assert False

    print("Finished %d species (%d annotated; %d positive, %d negative)" %
          (numSpecies, numPositive + numNegative, numPositive, numNegative))
def runDistributed():
    """Double every stored 'paired-mRNA-fraction' whose source is "computed".

    Species without a stored value, or whose value carries any other source
    string, are left alone.  Corrected values are written back with the
    source "computed (v2)", so a value is only ever doubled once.
    """
    for taxId in allSpeciesSource():
        stored = getSpeciesProperty(taxId, 'paired-mRNA-fraction')

        # Only values that exist and still carry the original source label
        # are corrected.
        if stored[0] is None or stored[1] != "computed":
            continue

        before = float(stored[0])
        after = before * 2

        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % after,
                           "computed (v2)",
                           overwrite=True)
        print("Fixed %d: %.4g -> %.4g" % (taxId, before, after))
def annotateDCBS(taxId, overwrite=False):
    """Make sure the 'DCBS-geomean' property exists for a species.

    A previously stored value is returned as-is unless it is missing or
    `overwrite` is true; in that case the value is recomputed with
    calcDCBS() and stored through setSpeciesProperty().

    Returns:
        (taxId, DCBS, isNew) -- isNew is True when the value was computed by
        this call, False when the stored value is returned.
    """
    stored = getSpeciesProperty(taxId, 'DCBS-geomean')

    if stored[0] is not None and not overwrite:
        # Old value, flagged as not new.
        return (taxId, stored[0], False)

    DCBS = float(calcDCBS(taxId))
    setSpeciesProperty(taxId, 'DCBS-geomean', str(DCBS), "DCBS (matlab, Renana)")

    # Newly computed value, flagged as new.
    return (taxId, DCBS, True)
def addSupportingAnnotationsForGenome(args, overwrite=False):
    """Record where the genome sequence and annotation of a species live.

    Stores three properties for `args.taxid` through setSpeciesProperty():
      * 'genome-seq-path'      - absolute path of `args.genome`
      * 'genome-annot-path'    - absolute path of `args.gff3`
      * 'genome-annot-variant' - `args.variant`, stored unchanged
    All three share one source string naming the current user and the
    current time.

    Args:
        args: object exposing `taxid`, `genome`, `gff3` and `variant`
            attributes.
        overwrite: forwarded to setSpeciesProperty().

    Raises:
        IOError: if `args.genome` or `args.gff3` does not exist.  Both paths
            are checked before anything is stored, so a failure never leaves
            a partially written annotation behind.
    """
    # Validate everything first.  An explicit raise is used instead of
    # `assert` so the check also runs under `python -O`.
    for description, path in (("Genome sequence", args.genome),
                              ("Genome annotation (GFF3)", args.gff3)):
        if not os.path.exists(path):
            raise IOError("{} file not found: {}".format(description, path))

    propSource = "Manual entry; {}; {}".format(getuser(), datetime.now().isoformat(' '))

    setSpeciesProperty(args.taxid, "genome-seq-path", os.path.abspath(args.genome),
                       propSource, overwrite=overwrite)
    setSpeciesProperty(args.taxid, "genome-annot-path", os.path.abspath(args.gff3),
                       propSource, overwrite=overwrite)
    setSpeciesProperty(args.taxid, "genome-annot-variant", args.variant,
                       propSource, overwrite=overwrite)
def annotateCUBmeasures(taxId, overwrite=False):
    """Make sure the genome-level codon-usage-bias properties are stored.

    The presence of 'genomic-CAI' decides whether work is needed: when it is
    stored and `overwrite` is false, its value is returned unchanged.
    Otherwise calculateGenomeLevelCUBmeasures() is called and CAI, CBI, Fop
    and Nc are read from the first row of its result, sanity-checked and
    stored through setSpeciesProperty().

    Returns:
        (taxId, CAI, isNew) -- only CAI is reported back; isNew is True when
        the measures were computed by this call.
    """
    storedCAI = getSpeciesProperty(taxId, 'genomic-CAI')

    if storedCAI[0] is not None and not overwrite:
        # Old value, flagged as not new.
        return (taxId, storedCAI[0], False)

    cubDf = calculateGenomeLevelCUBmeasures(taxId)
    print(cubDf)

    firstRow = cubDf.iloc[0]
    CAI = firstRow.at['CAI']
    CBI = firstRow.at['CBI']
    Fop = firstRow.at['Fop']
    Nc = firstRow.at['Nc']

    assert 0.0 < CAI < 1.0
    assert -0.5 < CBI < 1.0
    assert 0.0 < Fop < 1.0
    # The true extreme values of ENc are not well established; these bounds
    # are only a sanity check.
    assert 10.0 < Nc < 75.0

    print(CAI, CBI, Fop, Nc)

    measures = (
        ('genomic-CAI', CAI),
        ('genomic-CBI', CBI),
        ('genomic-Fop', Fop),
        ('genomic-Nc-codonw', Nc),
    )
    for propName, value in measures:
        setSpeciesProperty(taxId, propName, "{:.4}".format(value), "codonw 1.4.4")

    # Newly computed value, flagged as new.
    return (taxId, CAI, True)
def _parseOptimumTemperature(rawValue):
    """Turn a raw 'OptimumTemperature' value into a number.

    Recognised forms:
      * a string ending in 'C' (e.g. '37C') -> the leading integer;
      * the bare string 'C'                 -> None (no value reported);
      * a 2-tuple                           -> the mean of its two items;
      * anything that is neither a string nor a tuple is returned unchanged.
    Any other string or tuple is reported on stdout and yields None.
    """
    if isinstance(rawValue, str):
        if rawValue == 'C':
            return None
        # endswith() also copes with the empty string, unlike rawValue[-1].
        if rawValue.endswith('C'):
            return int(rawValue[:-1])
        print("Unknown val %s" % rawValue)
        return None

    if isinstance(rawValue, tuple):
        if len(rawValue) == 2:
            return (float(rawValue[0]) + float(rawValue[1])) / 2
        # Wrap in a 1-tuple so '%' formats the tuple itself instead of
        # unpacking it as the argument list.
        print("Unknown val %s" % (rawValue,))
        return None

    return rawValue


def testAll():
    """Collect genome attributes for all species and store them as properties.

    Steps:
      1. Call testGettingGenomeAttributes() on a fixed list of genome ids.
      2. For every species from allSpeciesSource() (restricted to
         `limitSpecies` when that is non-empty), fetch the attributes of its
         first genome id and collect the 'Environment:' and 'Statistics:'
         entries of interest.
      3. Print everything collected, together with summary counts.
      4. Store the collected values through setSpeciesProperty() with source
         "entrez" and overwrite=False.

    Returns:
        0
    """
    # --- 1. Fixed sample of genome ids ------------------------------------
    sampleGenomes = (
        (10796, "Archaea"),
        (15, "Eukaryota"),
        (1059, "Archaea"),
        (1030, "Bacteria"),
        (1070, "Bacteria"),
        (1564, "Archaea"),
        (1589, "Bacteria"),
        (1124, "Bacteria"),
        (820, "Bacteria"),
        (1069, "Bacteria"),
        (410, "Eukaryota"),
        (691, "Bacteria"),
        (815, "Bacteria"),
        (416, "Bacteria"),
        (1014, "Bacteria"),
    )
    for sampleGenomeId, sampleKingdom in sampleGenomes:
        testGettingGenomeAttributes(sampleGenomeId, sampleKingdom)

    print("---------------------------------------------")

    # --- 2. Collect attributes for every species --------------------------
    totalCount = 0
    envFoundCount = 0
    tempFoundCount = 0
    statsFoundCount = 0

    temps1 = {}        # taxId -> 'TemperatureRange'
    temps2 = {}        # taxId -> 'OptimumTemperature' (raw, unparsed)
    oxygenReq = {}
    habitat = {}
    salinity = {}
    proteinCount = {}
    gcContent = {}
    genomeSize = {}

    # (key in the attribute section, dictionary receiving the value)
    envTargets = (
        ('TemperatureRange', temps1),
        ('OptimumTemperature', temps2),
        ('OxygenReq', oxygenReq),
        ('Salinity', salinity),
        ('Habitat', habitat),
    )
    statsTargets = (
        ('protein count', proteinCount),
        ('GC%', gcContent),
        ('total length (Mb)', genomeSize),
    )

    for taxId in allSpeciesSource():
        if limitSpecies and taxId not in limitSpecies:
            continue

        genomesList = taxIdToGenomeId(taxId)
        if not genomesList:
            print("No genome-id found for (taxId=%d), skipping..." % taxId)
            continue

        kingdom = getKingdomForSpecies(taxId)
        genomeId = genomesList[0]  # TODO - is this right?

        props = testGettingGenomeAttributes(genomeId, kingdom)

        envFound = 'Environment:' in props
        tempFound = False
        if envFound:
            envprops = props['Environment:']
            for key, target in envTargets:
                if key in envprops:
                    target[taxId] = envprops[key]
            tempFound = ('TemperatureRange' in envprops) or ('OptimumTemperature' in envprops)

        statsFound = 'Statistics:' in props
        if statsFound:
            stats = props['Statistics:']
            for key, target in statsTargets:
                if key in stats:
                    target[taxId] = stats[key]

        totalCount += 1
        if envFound:
            envFoundCount += 1
        if tempFound:
            tempFoundCount += 1
        if statsFound:
            statsFoundCount += 1

    # --- 3. Report --------------------------------------------------------
    report = (
        ("TemperatureRange", temps1),
        ("OptimumTemperature", temps2),
        ("Salinity", salinity),
        ("Habitat", habitat),
        ("OxygenReq", oxygenReq),
        ("ProteinCount", proteinCount),
        ("GC%", gcContent),
        ("genomeSize", genomeSize),
    )
    for title, values in report:
        print(title)
        print(values)

    print("Total: %d\tEnv found: %d\tTemp found: %d\tStats found: %d" %
          (totalCount, envFoundCount, tempFoundCount, statsFoundCount))

    # Reduce the raw optimum temperatures to plain numbers, dropping the
    # values that cannot be interpreted.
    x = {}
    for k, v in temps2.items():
        parsed = _parseOptimumTemperature(v)
        if parsed is not None:
            x[k] = parsed
    print(x)

    # --- 4. Store ---------------------------------------------------------
    for taxId, temperature in x.items():
        setSpeciesProperty(taxId, 'optimum-temperature', '%g' % temperature, "entrez", overwrite=False)

    for taxId, tempRange in temps1.items():
        setSpeciesProperty(taxId, 'temperature-range', tempRange, "entrez", overwrite=False)

    # These three are stored verbatim, except for the 'Unknown' placeholder.
    for propName, values in (('salinity', salinity),
                             ('habitat', habitat),
                             ('oxygen-req', oxygenReq)):
        for taxId, val in values.items():
            if val == 'Unknown':
                continue
            setSpeciesProperty(taxId, propName, val, "entrez", overwrite=False)

    for taxId, val in proteinCount.items():
        setSpeciesProperty(taxId, 'protein-count', val, "entrez", overwrite=False)

    for taxId, val in gcContent.items():
        # Values outside 10..90 are skipped rather than stored.
        if val > 90 or val < 10:
            continue
        if setSpeciesProperty(taxId, 'gc-content', "%g" % val, "entrez", overwrite=False):
            print("[gc-content (taxid=%d) -> %g]" % (taxId, val))

    for taxId, val in genomeSize.items():
        setSpeciesProperty(taxId, 'genome-size-mb', "%g" % val, "entrez", overwrite=False)

    return 0

    # NOTE: everything below is unreachable because of the `return 0` above.
    # It is kept as the author's manual test scaffolding.
    genomeIdentifiers = taxIdToGenomeId(3055)  # Obtain genome-ids for this tax-id
    for genomeId in genomeIdentifiers:
        report = fetchEntrezGenomeReportForSpecies(genomeId)
        props = parseNCBIGenomeHTML_fetchSummaryReport(report)
        print(props)

    #return 0
    #------------
    #for fn in ("NCBI_genome_1030.html", "NCBI_genome_1070.html", "NCBI_genome_1347.html"):
    #    with open(fn, "r") as f:
    #        print("Testing %s..." % fn)
    #        props = parseNCBIGenomeHTML_fetchSummaryReport(f.read())
    #        print(props)

    for fn in ("NCBI_genomes_report_15_table.txt", ):
        with open(fn, "r") as f:
            print("Testing %s..." % fn)
            (genomeId, assemblyId) = parseNCBIGenomeAssembliesHTML_fetchMainAssembly(
                fixMissingImgCloseTags(wrapTableFragmentAsXML(f.read())))
            print((genomeId, assemblyId))

    return 0
#{'Accept2 (repeat)': '0', '#Vieira_Silva_Species': 'Yersinia pestis', 'Internal_species': 'Aspergillus niger', 'TieWarning': 'True', 'CheckConcensus': '0', 'GrowthTime': '1.25', 'Accept (manual decision \xe2\x80\x93 store into redis yes/no)': '0', 'Internal_taxid': '5061'} rowAcceptYesNo = 'Accept (manual decision \xe2\x80\x93 store into redis yes/no)' rowTaxId = 'Internal_taxid' rowGrowthTime = 'GrowthTime' numAcceptedRows = 0 with open( viera_silva_supp_tableA1_mapping_filename, "r") as csvfile: for row in csv.DictReader(csvfile, delimiter="\t"): assert(len(row)==8) if int(row[rowAcceptYesNo]) != 1: continue growthTimeHours = float(row[rowGrowthTime]) # make sure value is valid floating-point number (however original string repr. will be stored) taxId = int(row[rowTaxId]) if not dryRun: setSpeciesProperty( taxId, "growth-time-hours", row[rowGrowthTime], "Vieira-Silva table A1", overwrite=overwriteValues ) else: print( taxId, "growth-time-hours", row[rowGrowthTime], "Vieira-Silva table A1", overwriteValues ) numAcceptedRows += 1 print("numAcceptedRows: {}".format(numAcceptedRows))
def runDistributed():
    """Compute 'paired-mRNA-fraction' for a random sample of species via dask.

    For each sampled species that has no stored value yet, the CDS set is
    split into `size // fractionSize` fractions and one delayed
    calcNativePairedFraction(taxId, i, numFractions) call is scheduled per
    fraction.  Once all calls have finished, the per-fraction counts are
    summed per species and the ratio of paired to total nucleotides is
    stored with source "computed (v3)" (overwrite=False).

    Returns:
        dict mapping taxId to a tuple
        (cdsCount, pairedNucleotides, totalNucleotides, set of fractions
        received).
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    delayedCalls = []
    fractionSize = 20

    for taxId in allSpeciesSource():
        # Random sub-sampling: only species drawing a 0 are processed.
        if randint(0, 20) > 0:
            continue

        # Skip species that already have a value.
        if getSpeciesProperty(taxId, 'paired-mRNA-fraction')[0] is not None:
            continue

        size = countSpeciesCDS(taxId)

        # Floor division: range() needs an int, and `/` yields a float on
        # Python 3.  Species with fewer than `fractionSize` CDSs therefore get
        # zero fractions and are not scheduled.
        numFractions = size // fractionSize

        for i in range(numFractions):
            delayedCalls.append(dask.delayed(calcNativePairedFraction)(taxId, i, numFractions))

    print("Starting %d calls..." % len(delayedCalls))

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort; the wait() below still blocks
        # until every task has finished.
        print(e)

    print("\n")
    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    # Sum the per-fraction results of each species.
    results = {}
    errorsCount = 0
    for f in futures:
        try:
            (taxId, fraction, cdsCount, countPairedNucleotides,
             countTotalNucleotides) = scheduler.gather(f)

            current = results.get(taxId, (0, 0, 0, set()))
            results[taxId] = (current[0] + cdsCount,
                              current[1] + countPairedNucleotides,
                              current[2] + countTotalNucleotides,
                              current[3].union((fraction, )))
        except Exception as e:
            # A failed task is counted and reported; its species will then
            # usually fail the completeness check below.
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        # Completeness check: the received fractions must be exactly
        # 0..max.
        # NOTE(review): this cannot detect missing trailing fractions --
        # consider comparing against the scheduled numFractions instead.
        if len(result[3]) != max(result[3]) + 1:
            print("Found invalid number of items for taxId=%d" % taxId)
            continue

        fraction = float(result[1]) / result[2]

        setSpeciesProperty(taxId, "paired-mRNA-fraction", "%.4g" % fraction,
                           "computed (v3)", overwrite=False)
        print("TaxId: %d\t\tFraction: %.4g" % (taxId, fraction))

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    return results