def runDistributed():
    """Run annotateENcPrime for every species lacking an "ENc-prime" property.

    One delayed call is created per species returned by allSpeciesSource()
    whose "ENc-prime" property (first element of getSpeciesProperty()) is
    None. All calls are submitted through the scheduler returned by
    _distributed.open(), and the results are gathered once all have finished.

    Returns:
        dict mapping taxId -> (ENc, ENc_prime) for every task whose result
        could be gathered. Failed tasks are printed, counted and omitted.
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    delayedCalls = []
    for taxId in allSpeciesSource():
        # Only species that do not have a stored value yet are processed
        if getSpeciesProperty(taxId, "ENc-prime")[0] is not None:
            continue

        print(taxId)
        delayedCalls.append(dask.delayed(annotateENcPrime)(taxId))

    print("Starting %d calls..." % len(delayedCalls))

    # Submit all delayed calculations; obtain futures immediately
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort; the wait() below is what actually
        # guarantees completion. (Was print(E), which raised NameError.)
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    newValuesCount = 0
    oldValuesCount = 0
    for f in futures:
        try:
            (taxId, ENc, ENc_prime, isFreshValue) = scheduler.gather(f)
            results[taxId] = (ENc, ENc_prime)
            if isFreshValue:
                newValuesCount += 1
            else:
                oldValuesCount += 1
        except Exception as e:
            # A failed task must not abort collection of the remaining ones
            print(e)
            errorsCount += 1

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    print("{} new values; {} old values".format(newValuesCount, oldValuesCount))
    return results
def calcFastaProfiles(fastaInput):
    """Compute a native profile for every sequence in a FASTA input and save them as CSV.

    One delayed call to calculateNativeProfileForSequence(seqId, seq, 31, 1)
    is created per (seqId, seq) pair yielded by fastaSequencesSource().
    Gathered results are written to "<fastaInput>.profiles.csv", one row per
    sequence: the sequence id followed by the profile values.

    Args:
        fastaInput: passed to fastaSequencesSource() and used as the prefix
            of the output file name.

    Returns:
        None. Failed tasks are printed and counted; a summary banner is
        printed if there were any.
    """
    import sys

    import dask

    delayedCalls = []
    for (seqId, seq) in fastaSequencesSource(fastaInput):
        call = dask.delayed(calculateNativeProfileForSequence)(seqId, seq, 31, 1)
        delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    # NOTE(review): `scheduler` and `_distributed` are not bound inside this
    # function (unlike the sibling runDistributed functions); presumably they
    # are module-level names -- confirm.
    # Submit all delayed calculations; obtain futures immediately
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort. (Was print(E), which raised NameError.)
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    errorsCount = 0
    results = []
    for f in futures:
        try:
            (seqId, profile) = scheduler.gather(f)
            results.append((seqId, profile))
        except Exception as e:
            # A failed task must not abort collection of the remaining ones
            print(e)
            errorsCount += 1

    outputPath = "{}.profiles.csv".format(fastaInput)
    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline="" on Python 3 (writing str rows to "wb" raises TypeError).
    if sys.version_info[0] >= 3:
        csvfile = open(outputPath, "w", newline="")
    else:
        csvfile = open(outputPath, "wb")

    with csvfile:
        csvout = csv.writer(csvfile)
        for (seqId, profile) in results:
            # List concatenation: assumes profile is a list -- TODO confirm
            csvout.writerow([seqId] + profile)

    if errorsCount:
        print("==" * 20)
        print("Finished with %d errors!" % errorsCount)
        print("==" * 20)
def runDistributed(args):
    """Run calcProfilesForSpeciesX for every taxid in args.taxid on the scheduler.

    Args:
        args: object with a `taxid` iterable; it is also passed unchanged to
            every calcProfilesForSpeciesX(taxid, args) call.

    Returns:
        dict mapping each requested taxid to the value returned by its task,
        or to None if the task failed or returned a mismatching taxid
        (the first element of the result is expected to equal the taxid).
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    taxids = []
    delayedCalls = []
    for taxid in args.taxid:
        delayedCalls.append(dask.delayed(calcProfilesForSpeciesX)(taxid, args))
        taxids.append(taxid)

    # Submit all delayed calculations; obtain futures immediately
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort. (Was print(E), which raised NameError.)
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    # futures are in the same order as delayedCalls, hence as taxids
    for taxid, f in zip(taxids, futures):
        try:
            r = scheduler.gather(f)
            returnedTaxId = r[0]
            # Explicit check instead of `assert`, which is stripped under -O
            if taxid != returnedTaxId:
                raise ValueError(
                    "Task for taxid=%s returned result for taxid=%s" % (taxid, returnedTaxId))
            results[taxid] = r
        except Exception as e:
            print(e)
            results[taxid] = None
            errorsCount += 1

    print("Finished with %d errors" % errorsCount)
    return results
def runDistributed():
    """Compute and store the "paired-mRNA-fraction" property for species lacking it.

    The CDSs of each selected species are split into numFractions chunks and
    one calcNativePairedFraction(taxId, i, numFractions) task is created per
    chunk. Per-chunk counts are summed per species; if every chunk index
    0..max was received, the ratio of paired to total nucleotides is stored
    with setSpeciesProperty(..., overwrite=False).

    Returns:
        dict mapping taxId -> (cdsCount, pairedNucleotides, totalNucleotides,
        set of received fraction indices), including species that were
        reported as incomplete.
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    delayedCalls = []

    fractionSize = 20  # divisor used to derive the number of tasks per species

    for taxId in allSpeciesSource():
        # NOTE(review): random sub-sampling -- keeps roughly 1 in 21 species.
        # The same filter is marked "DEBUG ONLY" and disabled elsewhere in
        # this file; confirm it is meant to be active here.
        if randint(0, 20) > 0:
            continue

        # Only species that do not have a stored value yet are processed
        if getSpeciesProperty(taxId, 'paired-mRNA-fraction')[0] is not None:
            continue

        size = countSpeciesCDS(taxId)

        # Floor division: range() needs an int ("/" yields a float on Python 3).
        # NOTE(review): species with fewer than fractionSize CDSs get zero
        # tasks and are therefore skipped -- confirm this is intended.
        numFractions = size // fractionSize

        for i in range(numFractions):
            call = dask.delayed(calcNativePairedFraction)(taxId, i, numFractions)
            delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    # Submit all delayed calculations; obtain futures immediately
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort. (Was print(E), which raised NameError.)
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            (taxId, fraction, cdsCount, countPairedNucleotides,
             countTotalNucleotides) = scheduler.gather(f)

            # Accumulate per-species totals and remember which chunk indices arrived
            current = results.get(taxId, (0, 0, 0, set()))
            results[taxId] = (current[0] + cdsCount,
                              current[1] + countPairedNucleotides,
                              current[2] + countTotalNucleotides,
                              current[3] | {fraction})
        except Exception as e:
            # A failed task must not abort collection of the remaining ones
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        # Chunk indices are 0-based, so a complete set has max+1 members;
        # otherwise at least one chunk failed and the totals are partial.
        if len(result[3]) != max(result[3]) + 1:
            print("Found invalid number of items for taxId=%d" % taxId)
            continue

        fraction = float(result[1]) / result[2]

        setSpeciesProperty(taxId, "paired-mRNA-fraction", "%.4g" % fraction,
                           "computed (v3)", overwrite=False)

        print("TaxId: %d\t\tFraction: %.4g" % (taxId, fraction))

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    return results
def speciesStatisticsAndValidityReport(args):
    """Build a per-species statistics/validity table and write it out as reports.

    For every species from allSpeciesSource() (minus speciesToExclude, and
    restricted to args.taxid when that is non-empty) this collects stored
    annotations and taxonomy, then runs distributed tasks that compute
    native-sequence statistics (calcNativeSequencesStatistics, split into
    chunks) and the number of shuffled profiles (countShuffledProfiles).

    Output files written to the current directory:
        species_report.html, species_report_simple.rst,
        species_report_simple.html, species_report.xlsx

    Args:
        args: object with a `taxid` attribute; when truthy it acts as a
            whitelist of species to include.

    Returns:
        None.
    """
    import _distributed

    # Zero-row frame that fixes the column names and intended dtypes
    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),             # Species TaxId
        'Species': pd.Series([], dtype='str'),           # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),                    # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),                    # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),           # CDS count for this species
        'NumCDSsInProfile': pd.Series([], dtype='int'),  # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),
        'CDSDifference': pd.Series([], dtype='float'),
        'NumNativeSeqs': pd.Series([], dtype='int'),
        'GCContentInCDS': pd.Series([], dtype='float'),
        'AnnotatedGCContent': pd.Series([], dtype='float'),
        'RowType': pd.Categorical([]),                   # Species count or total
        'Warnings': pd.Series([], dtype='str'),
        'CDSWarnings': pd.Series([], dtype='int'),
        'CDSWarnings_': pd.Series([], dtype='str'),
        'FirstAA': pd.Series([], dtype='str'),
        'LastAA': pd.Series([], dtype='str')
    })

    scheduler = _distributed.open()

    delayedCalls_native = []
    shuffledCounts = {}
    delayedCalls_shuffledProfiles = []

    fractionSize = 1000  # How many sequences (roughly) to process in each task

    # One single-row frame per species; concatenated once after the loop
    # (DataFrame.append in a loop is quadratic and was removed in pandas 2.0).
    speciesRows = []

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist

        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        cdsCountInRedis = countSpeciesCDS(taxId)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]
        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        # Percent difference between the stored CDS count and the annotated
        # protein count; deviations above 9.9% are flagged.
        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0

            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        # Taxonomic lineage of the species
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)
        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain: prefer 'superkingdom', fall back to 'kingdom'
        kingdomTaxId = [t for t, rank in ranks.items() if rank == 'superkingdom']
        if not kingdomTaxId:
            kingdomTaxId = [t for t, rank in ranks.items() if rank == 'kingdom']
        # NOTE(review): raises IndexError if the lineage has neither rank
        domain = names[kingdomTaxId[0]]

        # Determine phylum (left empty if the lineage has no 'phylum' rank)
        phylumName = ""
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        # Computed columns start with placeholders and are filled in after
        # the distributed tasks complete.
        speciesRows.append(
            pd.DataFrame({
                'TaxId': pd.Series([taxId], dtype='int'),  # Species TaxId
                'Species': pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname': pd.Series([shortNames[taxId]], dtype='str'),
                'Domain': pd.Categorical([domain]),        # Bacteria, Eukaryota, Archaea
                'Phylum': pd.Categorical([phylumName]),    # Phylum name (string)
                'NumCDSs': pd.Series([cdsCountInRedis], dtype='int'),  # CDS count for this species
                'NumCDSsInProfile': pd.Series([0], dtype='int'),       # Num seqs with 20 shuffled profiles
                'AnnotatedNumCDSs': pd.Series(
                    [0 if annotatedProteinCount is None else annotatedProteinCount],
                    dtype='int'),
                'CDSDifference': pd.Series([proteinDifference], dtype='float'),
                'NumNativeSeqs': pd.Series([0], dtype='int'),
                'GCContentInCDS': pd.Series([0.0], dtype='float'),
                'AnnotatedGCContent': pd.Series([annotatedGCContent], dtype='float'),
                'RowType': pd.Categorical(["species"]),    # Species count or total
                'Warnings': pd.Series([", ".join(warnings)], dtype='str'),
                'CDSWarnings': pd.Series([0], dtype='int'),
                'CDSWarnings_': pd.Series([""], dtype='str'),
                'FirstAA': pd.Series([""], dtype='str'),
                'LastAA': pd.Series([""], dtype='str'),
                'Source': pd.Series([""], dtype='str')
            }))

        # Floor division: range() needs an int ("/" yields a float on Python 3)
        numFractions = cdsCountInRedis // fractionSize
        if numFractions == 0:
            numFractions = 1  # small species still get one task

        for i in range(numFractions):
            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i, numFractions)
            delayedCalls_native.append(call)

        # One shuffled-profile count per species (its arguments do not depend
        # on the chunk index, and results are keyed by taxId).
        call = dask.delayed(countShuffledProfiles)(taxId, (310, 10, "begin", 0), 102, 11)
        delayedCalls_shuffledProfiles.append(call)

    speciesDf = pd.concat([speciesDf] + speciesRows)
    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    # Submit all delayed calculations; obtain futures immediately
    futures = scheduler.compute(delayedCalls_native + delayedCalls_shuffledProfiles)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort. (Was print(E), which raised NameError.)
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            ret = scheduler.gather(f)

            # The two task kinds are told apart by the length of their result
            if len(ret) == 9:
                (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings,
                 warnings, firstAA, lastAA) = ret

                # Sum the chunk into the per-species running totals
                current = results.get(
                    taxId, (0, 0, 0, 0, Counter(), Counter(), Counter()))
                results[taxId] = (current[0] + cdsCount,
                                  current[1] + gcCounts,
                                  current[2] + totalCounts,
                                  current[3] + cdsWarnings,
                                  current[4] + warnings,
                                  current[5] + firstAA,
                                  current[6] + lastAA)

            elif len(ret) == 2:
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                # Explicit raise instead of `assert False`, which is stripped
                # under -O; still caught and counted by the handler below.
                raise ValueError(
                    "Unexpected task result with {} items".format(len(ret)))

        except Exception as e:
            # A failed task must not abort collection of the remaining ones
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings, firstAA,
         lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs
        # GC content as a percentage, rounded to one decimal
        speciesDf.at[taxId, 'GCContentInCDS'] = round(
            float(gcCounts) / float(totalCounts) * 100.0, 1)
        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings
        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows

    # Full HTML report
    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                          'FirstAA', 'LastAA'
                      ])

    # Simplified reStructuredText table (diagnostic columns dropped)
    with open("species_report_simple.rst", "w") as f:
        f.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_', 'FirstAA',
                'LastAA', 'CDSDifference'
            ], axis=1).pipe(tabulate, headers='keys', tablefmt='rst'))

    # Simplified HTML report
    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain'
                      ])

    # Excel export of the full table
    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')