Beispiel #1
0
def runDistributed():
    """Schedule ``annotateENcPrime`` for all species lacking an "ENc-prime" value.

    Every taxId yielded by ``allSpeciesSource()`` is checked with
    ``getSpeciesProperty(taxId, "ENc-prime")``; species for which the first
    element of that lookup is not ``None`` are skipped. For the rest, one
    delayed ``annotateENcPrime(taxId)`` call is submitted to the scheduler
    obtained from ``_distributed.open()``.

    Returns:
        dict: ``taxId -> (ENc, ENc_prime)`` for every task whose result could
        be gathered. Tasks that raise are printed and counted, not re-raised.
    """
    import _distributed
    import dask

    scheduler = _distributed.open()
    delayedCalls = []

    for taxId in allSpeciesSource():
        # A non-None first element presumably means the value is already
        # stored for this species, so there is nothing to compute.
        if getSpeciesProperty(taxId, "ENc-prime")[0] is not None:
            continue

        print(taxId)
        delayedCalls.append(dask.delayed(annotateENcPrime)(taxId))

    print("Starting %d calls..." % len(delayedCalls))

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress reporting is best-effort: report the failure and fall
        # through to the explicit wait below.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    newValuesCount = 0
    oldValuesCount = 0
    for f in futures:
        try:
            (taxId, ENc, ENc_prime, isFreshValue) = scheduler.gather(f)
        except Exception as e:
            # A failed task (or a result of unexpected shape) only counts as
            # an error; the remaining futures are still collected.
            print(e)
            errorsCount += 1
            continue

        results[taxId] = (ENc, ENc_prime)
        if isFreshValue:
            newValuesCount += 1
        else:
            oldValuesCount += 1

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    print("{} new values; {} old values".format(newValuesCount,
                                                oldValuesCount))
    return results
def calcFastaProfiles(fastaInput):
    """Compute a native profile per FASTA sequence and write them to a CSV file.

    One delayed ``calculateNativeProfileForSequence(seqId, seq, 31, 1)`` call
    is scheduled for every ``(seqId, seq)`` pair yielded by
    ``fastaSequencesSource(fastaInput)``. The gathered ``(seqId, profile)``
    results are written, one row per sequence, to
    ``"<fastaInput>.profiles.csv"``.

    Args:
        fastaInput: passed to ``fastaSequencesSource`` and used as the prefix
            of the output file name (presumably a path to a FASTA file).

    NOTE(review): ``scheduler``, ``_distributed`` and ``csv`` are not bound
    inside this function, so they must exist at module level -- confirm.
    """
    import dask

    delayedCalls = []

    for (seqId, seq) in fastaSequencesSource(fastaInput):
        # NOTE(review): the meaning of the constants 31 and 1 is defined by
        # calculateNativeProfileForSequence, which is not visible here.
        call = dask.delayed(calculateNativeProfileForSequence)(seqId, seq, 31,
                                                               1)
        delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress reporting is best-effort: report the failure and fall
        # through to the explicit wait below.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    errorsCount = 0
    results = []

    for f in futures:
        try:
            (seqId, profile) = scheduler.gather(f)
        except Exception as e:
            # Failed tasks are reported and counted; their rows are omitted.
            print(e)
            errorsCount += 1
            continue

        results.append((seqId, profile))

    # NOTE(review): binary mode is what the Python 2 csv module expects; on
    # Python 3 this would need mode "w" with newline="" -- confirm the target
    # interpreter before changing it.
    with open("{}.profiles.csv".format(fastaInput), "wb") as csvfile:
        csvout = csv.writer(csvfile)
        for (seqId, profile) in results:
            # The concatenation requires `profile` to be a list.
            csvout.writerow([seqId] + profile)

    if errorsCount:
        print("==" * 20)
        print("Finished with %d errors!" % errorsCount)
        print("==" * 20)
def runDistributed(args):
    """Run ``calcProfilesForSpeciesX`` for every taxid in ``args.taxid``.

    One delayed ``calcProfilesForSpeciesX(taxid, args)`` call is submitted per
    taxid to the scheduler obtained from ``_distributed.open()``.

    Args:
        args: object whose ``taxid`` attribute is an iterable of taxids; the
            whole object is also forwarded to every task.

    Returns:
        dict: ``taxid -> task result`` for tasks that succeeded, and
        ``taxid -> None`` for tasks that raised or whose result did not start
        with the taxid they were submitted for.
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    taxids = []
    delayedCalls = []

    for taxid in args.taxid:
        delayedCalls.append(dask.delayed(calcProfilesForSpeciesX)(taxid, args))
        taxids.append(taxid)

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress reporting is best-effort: report the failure and fall
        # through to the explicit wait below.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    # `futures` is paired positionally with the taxids the calls were built
    # from; the check below verifies that pairing against the task's own
    # report.
    for taxid, f in zip(taxids, futures):
        try:
            r = scheduler.gather(f)
            returnedTaxId = r[0]
            # Explicit raise instead of `assert`, so the check is not
            # stripped when Python runs with -O.
            if taxid != returnedTaxId:
                raise ValueError(
                    "Task for taxid %s returned result for taxid %s" %
                    (taxid, returnedTaxId))
            results[taxid] = r

        except Exception as e:
            print(e)
            results[taxid] = None
            errorsCount += 1

    print("Finished with %d errors" % errorsCount)
    return results
def runDistributed():
    """Compute and store the "paired-mRNA-fraction" property per species.

    For each selected species the CDS set is split into
    ``countSpeciesCDS(taxId) // fractionSize`` fractions, and one delayed
    ``calcNativePairedFraction(taxId, i, numFractions)`` call is scheduled per
    fraction. The per-fraction counts are summed per species; when all
    fractions of a species were collected, the ratio of paired to total
    nucleotides is stored with ``setSpeciesProperty``.

    Returns:
        dict: ``taxId -> (cdsCount, pairedNucleotides, totalNucleotides,
        set of fraction indices received)``, including species for which no
        property was stored.
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    delayedCalls = []

    fractionSize = 20

    for taxId in allSpeciesSource():

        # Random subsampling: a species is kept only when randint() returns 0.
        # NOTE(review): this looks like leftover debug sampling -- confirm it
        # is intended.
        if randint(0, 20) > 0:
            continue

        # A non-None first element presumably means the value is already
        # stored for this species.
        if getSpeciesProperty(taxId, 'paired-mRNA-fraction')[0] is not None:
            continue

        size = countSpeciesCDS(taxId)

        # Floor division keeps numFractions an int under true division as
        # well; range() rejects floats. Species with fewer than fractionSize
        # CDSs get zero fractions and are therefore not processed.
        numFractions = size // fractionSize
        for i in range(numFractions):
            call = dask.delayed(calcNativePairedFraction)(taxId, i,
                                                          numFractions)
            delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress reporting is best-effort: report the failure and fall
        # through to the explicit wait below.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            (taxId, fraction, cdsCount, countPairedNucleotides,
             countTotalNucleotides) = scheduler.gather(f)

            current = results.get(taxId, (0, 0, 0, set()))

            # Accumulate the counts and remember which fraction indices have
            # been received for this species.
            results[taxId] = (current[0] + cdsCount,
                              current[1] + countPairedNucleotides,
                              current[2] + countTotalNucleotides,
                              current[3] | {fraction})

        except Exception as e:
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        # Completeness check: N received indices whose maximum is N-1 means
        # that exactly the indices 0..N-1 are present.
        if len(result[3]) != max(result[3]) + 1:
            print("Found invalid number of items for taxId=%d" % taxId)
            continue

        if not result[2]:
            # Without any counted nucleotides the ratio is undefined; skip
            # the species instead of failing with ZeroDivisionError.
            print("No nucleotides counted for taxId=%d" % taxId)
            continue

        fraction = float(result[1]) / result[2]

        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % fraction,
                           "computed (v3)",
                           overwrite=False)

        print("TaxId: %d\t\tFraction: %.4g" % (taxId, fraction))

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    return results
Beispiel #5
0
def speciesStatisticsAndValidityReport(args):
    """Collect per-species statistics and write them as HTML, RST and Excel.

    For every species from ``allSpeciesSource()`` that is not in
    ``speciesToExclude`` (and, when ``args.taxid`` is non-empty, is listed in
    it), a row with locally available data (CDS count, annotated protein
    count and GC content, domain, phylum) is added to a DataFrame. Two kinds
    of tasks are then run on the scheduler from ``_distributed.open()``:

    * ``calcNativeSequencesStatistics(taxId, i, numFractions)`` per fraction
      of roughly 1000 sequences, whose 9-tuples are summed per species, and
    * ``countShuffledProfiles(taxId, (310, 10, "begin", 0), 102, 11)`` once
      per species, returning a ``(taxId, count)`` pair.

    The task results are merged into the DataFrame, which is written to
    ``species_report.html``, ``species_report_simple.rst``,
    ``species_report_simple.html`` and ``species_report.xlsx`` in the current
    working directory.

    Args:
        args: object with a ``taxid`` attribute; when truthy it is used as a
            whitelist of species to include.
    """
    import _distributed
    import dask  # imported locally, as in the sibling runDistributed functions

    # Empty frame that fixes the dtype of each report column.
    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),  # Species TaxId
        'Species': pd.Series([], dtype='str'),  # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),  # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),  # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),  # CDS count for this species
        'NumCDSsInProfile':
        pd.Series([], dtype='int'
                  ),  # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),
        'CDSDifference': pd.Series([], dtype='float'),
        'NumNativeSeqs': pd.Series([], dtype='int'),
        'GCContentInCDS': pd.Series([], dtype='float'),
        'AnnotatedGCContent': pd.Series([], dtype='float'),
        'RowType': pd.Categorical([]),  # Species count or total
        'Warnings': pd.Series([], dtype='str'),
        'CDSWarnings': pd.Series([], dtype='int'),
        'CDSWarnings_': pd.Series([], dtype='str'),
        'FirstAA': pd.Series([], dtype='str'),
        'LastAA': pd.Series([], dtype='str')
    })

    scheduler = _distributed.open()

    delayedCalls_native = []

    shuffledCounts = {}
    delayedCalls_shuffledProfiles = []

    fractionSize = 1000  # How many sequences (roughly) to process in each task

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        cdsCountInRedis = countSpeciesCDS(taxId)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]

        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        # Relative difference (in percent) between the annotated protein
        # count and the number of stored CDSs; more than 9.9% is flagged.
        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0

            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        # Look up the lineage with its names and ranks.
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)

        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain: prefer the 'superkingdom' rank and fall
        # back to 'kingdom'. If neither rank is present, the indexing below
        # raises IndexError.
        kingdomTaxId = [
            t for t, rank in ranks.items() if rank == 'superkingdom'
        ]
        if not kingdomTaxId:
            kingdomTaxId = [
                t for t, rank in ranks.items() if rank == 'kingdom'
            ]
        domain = names[kingdomTaxId[0]]

        # Determine phylum (stays empty when the lineage has no phylum rank).
        phylumName = ""
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        # NOTE(review): DataFrame.append was removed in pandas 2.0, so this
        # requires an older pandas. It is kept as is because the appended
        # frame carries an extra 'Source' column, and a different
        # concatenation call could change the column order of the reports.
        speciesDf = speciesDf.append(
            pd.DataFrame({
                'TaxId':
                pd.Series([taxId], dtype='int'),  # Species TaxId
                'Species':
                pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname':
                pd.Series([shortNames[taxId]], dtype='str'),
                'Domain':
                pd.Categorical([domain]),  # Bacteria, Eukaryota, Archaea
                'Phylum':
                pd.Categorical([phylumName]),  # Phylum name (string)
                'NumCDSs':
                pd.Series([cdsCountInRedis],
                          dtype='int'),  # CDS count for this species
                'NumCDSsInProfile':
                pd.Series([0],
                          dtype='int'),  # Num seqs with 20 shuffled profiles
                'AnnotatedNumCDSs':
                pd.Series([
                    0
                    if annotatedProteinCount is None else annotatedProteinCount
                ],
                          dtype='int'),
                'CDSDifference':
                pd.Series([proteinDifference], dtype='float'),
                'NumNativeSeqs':
                pd.Series([0], dtype='int'),
                'GCContentInCDS':
                pd.Series([0.0], dtype='float'),
                'AnnotatedGCContent':
                pd.Series([annotatedGCContent], dtype='float'),
                'RowType':
                pd.Categorical(["species"]),  # Species count or total
                'Warnings':
                pd.Series([", ".join(warnings)], dtype='str'),
                'CDSWarnings':
                pd.Series([0], dtype='int'),
                'CDSWarnings_':
                pd.Series([""], dtype='str'),
                'FirstAA':
                pd.Series([""], dtype='str'),
                'LastAA':
                pd.Series([""], dtype='str'),
                'Source':
                pd.Series([""], dtype='str')
            }))

        # Floor division keeps numFractions an int under true division as
        # well; range() rejects floats. At least one task is always scheduled.
        numFractions = cdsCountInRedis // fractionSize
        if numFractions == 0:
            numFractions = 1

        for i in range(numFractions):
            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i,
                                                               numFractions)
            delayedCalls_native.append(call)

        call = dask.delayed(countShuffledProfiles)(taxId,
                                                   (310, 10, "begin", 0), 102,
                                                   11)
        delayedCalls_shuffledProfiles.append(call)

    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(
        delayedCalls_native + delayedCalls_shuffledProfiles)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress reporting is best-effort: report the failure and fall
        # through to the explicit wait below.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}

    errorsCount = 0
    for f in futures:
        try:
            ret = scheduler.gather(f)
            # Both task kinds share one futures list; they are told apart by
            # the length of the tuple they return.
            if len(ret) == 9:
                (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings,
                 warnings, firstAA, lastAA) = ret

                if taxId in results:
                    current = results[taxId]
                else:
                    current = (0, 0, 0, 0, Counter(), Counter(), Counter())

                # Element-wise sum of the per-fraction statistics. The last
                # three entries are added to Counters, so the tasks
                # presumably return Counters for them.
                results[taxId] = (current[0] + cdsCount,
                                  current[1] + gcCounts,
                                  current[2] + totalCounts,
                                  current[3] + cdsWarnings,
                                  current[4] + warnings,
                                  current[5] + firstAA,
                                  current[6] + lastAA)

            elif len(ret) == 2:
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                # Explicit raise instead of `assert (False)`, so unexpected
                # shapes are still counted as errors under -O.
                raise ValueError(
                    "Unexpected task result of length %d" % len(ret))

        except Exception as e:
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings, firstAA,
         lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs

        # GC content in percent, rounded to one decimal place.
        speciesDf.at[taxId, 'GCContentInCDS'] = round(
            float(gcCounts) / float(totalCounts) * 100.0, 1)

        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings

        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows
    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                          'FirstAA', 'LastAA'
                      ])

    with open("species_report_simple.rst", "w") as f:
        f.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                'FirstAA', 'LastAA', 'CDSDifference'
            ],
                           axis=1).pipe(tabulate,
                                        headers='keys',
                                        tablefmt='rst'))

    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain'
                      ])

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')