Exemple #1
0
def annotateENcPrime(taxId, overwrite=False):
    """Make sure the 'ENc' and 'ENc-prime' properties are stored for a species.

    The stored values are reused when both exist and `overwrite` is falsy.
    Otherwise both measures are recomputed with calculateENcPrimeForSpecies(),
    sanity-checked, and written back (as strings) with the source label
    "ENCprime (custom version)".

    Args:
        taxId: identifier of the species to annotate.
        overwrite: when truthy, recompute even if both values are stored.

    Returns:
        (taxId, ENc, ENc_prime, isNew). `isNew` is False when the previously
        stored values are returned (exactly as retrieved from storage) and
        True when the values were just calculated.

    Raises:
        AssertionError: if a freshly calculated value is not strictly between
            10.0 and 75.0 (only when assertions are enabled).
    """
    storedENc = getSpeciesProperty(taxId, 'ENc')
    storedENcPrime = getSpeciesProperty(taxId, 'ENc-prime')

    # Element [0] of a property record is the stored value (None when absent).
    bothStored = not (storedENc[0] is None or storedENcPrime[0] is None)
    if bothStored and not overwrite:
        # Old values; the trailing False marks them as not newly computed.
        return (taxId, storedENc[0], storedENcPrime[0], False)

    ENc, ENc_prime = calculateENcPrimeForSpecies(taxId)

    # The true extreme values of ENc are not clear; this is only a sanity check.
    assert 10.0 < ENc < 75.0
    assert 10.0 < ENc_prime < 75.0

    for propName, propValue in (('ENc', ENc), ('ENc-prime', ENc_prime)):
        setSpeciesProperty(taxId, propName, str(propValue),
                           "ENCprime (custom version)")

    # New values; the trailing True marks them as newly computed.
    return (taxId, ENc, ENc_prime, True)
def run():
    """Annotate every species with an 'algae' Yes/No property and print a summary.

    Classification precedence for each taxId from allSpeciesSource():
      1. lineage intersects algaeDefinition_ExcludedGroups -> 'No'
      2. taxId listed in positiveGroup                     -> 'Yes'
      3. taxId listed in negativeGroup                     -> 'No'
    Species matching none of these are left unannotated. Classified species
    are written with overwrite=True, so an existing 'algae' value is replaced.
    Warnings are printed for suspicious or missing annotations.
    """
    knownPositive = dict(positiveGroup)
    knownNegative = dict(negativeGroup)

    numSpecies = 0
    numYes = 0
    numNo = 0

    for taxId in allSpeciesSource():
        numSpecies += 1

        lineage = frozenset(ncbiTaxa.get_lineage(taxId))

        if lineage.intersection(algaeDefinition_ExcludedGroups):
            verdict = ('No', 'Excluded taxonomic group')
        elif taxId in knownPositive:
            verdict = ('Yes', knownPositive[taxId])
        elif taxId in knownNegative:
            verdict = ('No', knownNegative[taxId])
        else:
            verdict = None

        if verdict is None:
            # Not classified, but the lineage suggests it might be an alga.
            if lineage.intersection(algaeDefinition_IncludedGroups):
                print("Warning: check unannotated possible algae: %d" % taxId)
            continue

        setSpeciesProperty(taxId,
                           "algae",
                           verdict[0],
                           verdict[1],
                           overwrite=True)

        # Done; update counts
        answer = verdict[0]
        if answer == 'Yes':
            numYes += 1

            # A positive species should sit inside the included groups and
            # outside the excluded ones; flag anything else for review.
            if lineage.intersection(algaeDefinition_ExcludedGroups):
                print("Warning: possible false annotation: %d" % taxId)
            if not lineage.intersection(algaeDefinition_IncludedGroups):
                print("Warning: possible false annotation: %d" % taxId)
        elif answer == 'No':
            numNo += 1
        else:
            assert (False)

    print("Finished %d species (%d annotated; %d positive, %d negative)" %
          (numSpecies, numYes + numNo, numYes, numNo))
def runDistributed():
    """Double every stored 'paired-mRNA-fraction' whose source is "computed".

    For each species from allSpeciesSource() that has a value with source
    exactly "computed", the value is multiplied by 2 and written back with
    source "computed (v2)" (overwrite=True). Because the source label changes,
    a value already fixed is not matched again on a later run. Each change is
    printed.
    """
    for taxId in allSpeciesSource():
        record = getSpeciesProperty(taxId, 'paired-mRNA-fraction')

        # record[0] is the stored value, record[1] its source label.
        if record[0] is None or record[1] != "computed":
            continue

        before = float(record[0])
        after = before * 2
        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % after,
                           "computed (v2)",
                           overwrite=True)
        print("Fixed %d: %.4g -> %.4g" % (taxId, before, after))
Exemple #4
0
def annotateDCBS(taxId, overwrite=False):
    """Make sure the 'DCBS-geomean' property is stored for a species.

    Args:
        taxId: identifier of the species to annotate.
        overwrite: when truthy, recompute even if a value is already stored.

    Returns:
        (taxId, DCBS, isNew). `isNew` is False when the previously stored
        value is returned (exactly as retrieved from storage) and True when
        the value was just computed by calcDCBS() and written back with
        source "DCBS (matlab, Renana)".
    """
    stored = getSpeciesProperty(taxId, 'DCBS-geomean')

    if stored[0] is not None and not overwrite:
        # Old value; the trailing False marks it as not newly computed.
        return (taxId, stored[0], False)

    DCBS = float(calcDCBS(taxId))
    setSpeciesProperty(taxId, 'DCBS-geomean', str(DCBS),
                       "DCBS (matlab, Renana)")

    # New value; the trailing True marks it as newly computed.
    return (taxId, DCBS, True)
def addSupportingAnnotationsForGenome(args, overwrite=False):
    """Register the genome sequence/annotation files of a species.

    Stores three properties for `args.taxid`: 'genome-seq-path' and
    'genome-annot-path' (absolute paths of `args.genome` and `args.gff3`) and
    'genome-annot-variant' (`args.variant`). All three share a source label of
    the form "Manual entry; <user>; <timestamp>".

    Both paths are validated before anything is written, so a missing
    annotation file cannot leave a half-registered genome behind.

    Args:
        args: object exposing `taxid`, `genome`, `gff3` and `variant`
            attributes (presumably an argparse namespace -- confirm with caller).
        overwrite: forwarded to setSpeciesProperty() for every property.

    Raises:
        AssertionError: if `args.genome` or `args.gff3` does not exist. The
            exception type matches the previous `assert`-based check, but it
            is raised explicitly so the validation also runs under `python -O`.
    """
    for requiredPath in (args.genome, args.gff3):
        if not os.path.exists(requiredPath):
            raise AssertionError(
                "Required file does not exist: {}".format(requiredPath))

    propSource = "Manual entry; {}; {}".format(getuser(),
                                               datetime.now().isoformat(' '))

    properties = (
        ("genome-seq-path", os.path.abspath(args.genome)),
        ("genome-annot-path", os.path.abspath(args.gff3)),
        ("genome-annot-variant", args.variant),
    )
    for propName, propVal in properties:
        setSpeciesProperty(args.taxid,
                           propName,
                           propVal,
                           propSource,
                           overwrite=overwrite)
Exemple #6
0
def annotateCUBmeasures(taxId, overwrite=False):
    """Make sure the genome-level codon-usage-bias measures are stored.

    The presence of 'genomic-CAI' decides whether anything is computed. When
    it is missing (or `overwrite` is truthy), CAI, CBI, Fop and Nc are read
    from the first row of the table returned by
    calculateGenomeLevelCUBmeasures(), sanity-checked, and stored with four
    significant digits under source "codonw 1.4.4".

    Args:
        taxId: identifier of the species to annotate.
        overwrite: when truthy, recompute even if 'genomic-CAI' is stored.

    Returns:
        (taxId, CAI, isNew). `isNew` is False when the stored CAI is returned
        (exactly as retrieved from storage) and True when it was just computed.

    Raises:
        AssertionError: if a computed measure is outside its expected open
            range (only when assertions are enabled).
    """
    storedCAI = getSpeciesProperty(taxId, 'genomic-CAI')

    if storedCAI[0] is not None and not overwrite:
        # Old value; the trailing False marks it as not newly computed.
        return (taxId, storedCAI[0], False)

    cubDf = calculateGenomeLevelCUBmeasures(taxId)
    print(cubDf)

    # Only the first row is used (assumes a pandas-style table -- TODO confirm).
    genomeRow = cubDf.iloc[0]
    CAI = genomeRow.at['CAI']
    CBI = genomeRow.at['CBI']
    Fop = genomeRow.at['Fop']
    Nc = genomeRow.at['Nc']

    assert 0.0 < CAI < 1.0
    assert -0.5 < CBI < 1.0
    assert 0.0 < Fop < 1.0
    # The true extreme values of ENc are not clear; this is only a sanity check.
    assert 10.0 < Nc < 75.0
    print(CAI, CBI, Fop, Nc)

    measures = (
        ('genomic-CAI', CAI),
        ('genomic-CBI', CBI),
        ('genomic-Fop', Fop),
        ('genomic-Nc-codonw', Nc),
    )
    for propName, value in measures:
        setSpeciesProperty(taxId, propName, "{:.4}".format(value),
                           "codonw 1.4.4")

    # New value; the trailing True marks it as newly computed.
    return (taxId, CAI, True)
def _parseOptimumTemperature(rawValue):
    """Convert a raw 'OptimumTemperature' value to a number (None if unusable).

    Handles the forms the caller encounters:
      * a string such as '37C'      -> 37 (the trailing 'C' is stripped)
      * the bare string 'C'         -> None (unit without a number)
      * a 2-tuple (low, high)       -> midpoint as a float
      * any other string or tuple   -> None, after printing the offending value
      * anything else               -> returned unchanged
    """
    if isinstance(rawValue, str):
        if rawValue == 'C':
            return None
        # endswith() is safe on an empty string, unlike indexing rawValue[-1].
        if rawValue.endswith('C'):
            # int() is kept as-is: a non-integer such as '37.5C' raises ValueError.
            return int(rawValue[:-1])
        print("Unknown val %s" % rawValue)
        return None

    if isinstance(rawValue, tuple):
        if len(rawValue) == 2:
            return (float(rawValue[0]) + float(rawValue[1])) / 2
        # Wrap in a 1-tuple so the tuple itself is formatted by %s.
        print("Unknown val %s" % (rawValue, ))
        return None

    return rawValue


def testAll():
    """Fetch genome attributes for all species and store them as properties.

    Steps:
      1. Smoke-test testGettingGenomeAttributes() on a fixed list of genomes.
      2. For every species (optionally restricted by `limitSpecies`), fetch
         the attributes of its first genome id and collect the environment
         and statistics fields of interest.
      3. Print everything collected, plus coverage counts.
      4. Store the collected values with source "entrez" and overwrite=False,
         so existing properties are not replaced.

    Returns:
        0, always.
    """
    # (genome id, kingdom) pairs used as a quick check of the fetcher.
    smokeTestGenomes = (
        (10796, "Archaea"),
        (15, "Eukaryota"),
        (1059, "Archaea"),
        (1030, "Bacteria"),
        (1070, "Bacteria"),
        (1564, "Archaea"),
        (1589, "Bacteria"),
        (1124, "Bacteria"),
        (820, "Bacteria"),
        (1069, "Bacteria"),
        (410, "Eukaryota"),
        (691, "Bacteria"),
        (815, "Bacteria"),
        (416, "Bacteria"),
        (1014, "Bacteria"),
    )
    for knownGenomeId, knownKingdom in smokeTestGenomes:
        testGettingGenomeAttributes(knownGenomeId, knownKingdom)

    print("---------------------------------------------")

    totalCount = 0
    envFoundCount = 0
    tempFoundCount = 0
    statsFoundCount = 0

    # Each dict maps taxId -> raw value as found in the fetched attributes.
    temps1 = {}  # 'TemperatureRange'
    temps2 = {}  # 'OptimumTemperature'
    oxygenReq = {}
    habitat = {}
    salinity = {}
    proteinCount = {}
    gcContent = {}
    genomeSize = {}

    temperatureFields = (('TemperatureRange', temps1),
                         ('OptimumTemperature', temps2))
    otherEnvFields = (('OxygenReq', oxygenReq), ('Salinity', salinity),
                      ('Habitat', habitat))
    statsFields = (('protein count', proteinCount), ('GC%', gcContent),
                   ('total length (Mb)', genomeSize))

    for taxId in allSpeciesSource():
        if limitSpecies and taxId not in limitSpecies:
            continue

        genomesList = taxIdToGenomeId(taxId)
        if not genomesList:
            print("No genome-id found for (taxId=%d), skipping..." % taxId)
            continue

        kingdom = getKingdomForSpecies(taxId)
        genomeId = genomesList[0]  # TODO - is this right?
        props = testGettingGenomeAttributes(genomeId, kingdom)

        envFound = 'Environment:' in props
        statsFound = 'Statistics:' in props
        tempFound = False

        if envFound:
            envprops = props['Environment:']

            for fieldName, collected in temperatureFields:
                if fieldName in envprops:
                    tempFound = True
                    collected[taxId] = envprops[fieldName]

            for fieldName, collected in otherEnvFields:
                if fieldName in envprops:
                    collected[taxId] = envprops[fieldName]

        if statsFound:
            stats = props['Statistics:']

            for fieldName, collected in statsFields:
                if fieldName in stats:
                    collected[taxId] = stats[fieldName]

        totalCount += 1
        if envFound:
            envFoundCount += 1
        if tempFound:
            tempFoundCount += 1
        if statsFound:
            statsFoundCount += 1

    report = (
        ("TemperatureRange", temps1),
        ("OptimumTemperature", temps2),
        ("Salinity", salinity),
        ("Habitat", habitat),
        ("OxygenReq", oxygenReq),
        ("ProteinCount", proteinCount),
        ("GC%", gcContent),
        ("genomeSize", genomeSize),
    )
    for label, collected in report:
        print(label)
        print(collected)
    print("Total: %d\tEnv found: %d\tTemp found: %d\tStats found: %d" %
          (totalCount, envFoundCount, tempFoundCount, statsFoundCount))

    # Normalize the raw optimum temperatures to numbers, dropping unusable ones.
    optimumTemps = {}
    for taxId, rawValue in temps2.items():
        parsed = _parseOptimumTemperature(rawValue)
        if parsed is not None:
            optimumTemps[taxId] = parsed
    print(optimumTemps)

    for taxId, temperature in optimumTemps.items():
        setSpeciesProperty(taxId,
                           'optimum-temperature',
                           '%g' % temperature,
                           "entrez",
                           overwrite=False)

    for taxId, tempRange in temps1.items():
        setSpeciesProperty(taxId,
                           'temperature-range',
                           tempRange,
                           "entrez",
                           overwrite=False)

    # Categorical fields: the placeholder value 'Unknown' is not stored.
    categorical = (('salinity', salinity), ('habitat', habitat),
                   ('oxygen-req', oxygenReq))
    for propName, collected in categorical:
        for taxId, val in collected.items():
            if val == 'Unknown':
                continue
            setSpeciesProperty(taxId, propName, val, "entrez", overwrite=False)

    for taxId, val in proteinCount.items():
        setSpeciesProperty(taxId,
                           'protein-count',
                           val,
                           "entrez",
                           overwrite=False)

    for taxId, val in gcContent.items():
        # Skip values outside the 10..90 range.
        if (val > 90 or val < 10):
            continue

        if (setSpeciesProperty(taxId,
                               'gc-content',
                               "%g" % val,
                               "entrez",
                               overwrite=False)):
            print("[gc-content (taxid=%d) -> %g]" % (taxId, val))

    for taxId, val in genomeSize.items():
        setSpeciesProperty(taxId,
                           'genome-size-mb',
                           "%g" % val,
                           "entrez",
                           overwrite=False)

    return 0

# Import manually accepted growth times from the Vieira-Silva table A1 mapping
# file (tab-separated). Example of one parsed row:
#{'Accept2 (repeat)': '0', '#Vieira_Silva_Species': 'Yersinia pestis', 'Internal_species': 'Aspergillus niger', 'TieWarning': 'True', 'CheckConcensus': '0', 'GrowthTime': '1.25', 'Accept (manual decision \xe2\x80\x93 store into redis yes/no)': '0', 'Internal_taxid': '5061'}
#
# NOTE(review): '\xe2\x80\x93' is the UTF-8 byte sequence of an en dash written
# as escapes; it presumably matches the header only when the keys are raw byte
# strings -- confirm before running this where the file is decoded to text.
rowAcceptYesNo = 'Accept (manual decision \xe2\x80\x93 store into redis yes/no)'
rowTaxId = 'Internal_taxid'
rowGrowthTime = 'GrowthTime'

numAcceptedRows = 0
with open(viera_silva_supp_tableA1_mapping_filename, "r") as csvfile:
    for row in csv.DictReader(csvfile, delimiter="\t"):

        # Every row must have exactly the 8 columns shown in the example.
        assert len(row) == 8

        # Only rows manually marked as accepted (value 1) are imported.
        if int(row[rowAcceptYesNo]) != 1:
            continue

        # Parsed only to verify the value is a valid floating-point number;
        # the original string representation is what gets stored.
        growthTimeHours = float(row[rowGrowthTime])

        taxId = int(row[rowTaxId])

        if dryRun:
            # Dry run: show what would be stored without writing anything.
            print(taxId, "growth-time-hours", row[rowGrowthTime],
                  "Vieira-Silva table A1", overwriteValues)
        else:
            setSpeciesProperty(taxId,
                               "growth-time-hours",
                               row[rowGrowthTime],
                               "Vieira-Silva table A1",
                               overwrite=overwriteValues)

        numAcceptedRows += 1

print("numAcceptedRows: {}".format(numAcceptedRows))
            
                
def runDistributed():
    """Compute 'paired-mRNA-fraction' for a random sample of species.

    For each sampled species that has no stored value, the work is split into
    fractions that are evaluated by calcNativePairedFraction() as dask delayed
    calls on the scheduler opened through `_distributed`. The per-fraction
    counts are summed per species, and the ratio paired/total nucleotides is
    stored with source "computed (v3)" (overwrite=False).

    A species is stored only when every submitted fraction came back; species
    with missing fractions or with a zero total nucleotide count are reported
    and skipped.

    Returns:
        dict mapping taxId -> (cdsCount, pairedNucleotides, totalNucleotides,
        set of fraction indices received), including skipped species.
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    delayedCalls = []
    expectedFractions = {}  # taxId -> number of fractions submitted

    fractionSize = 20

    for taxId in allSpeciesSource():

        # Random subsampling: randint(0, 20) is 0 for about 1 in 21 species.
        if randint(0, 20) > 0:
            continue

        if getSpeciesProperty(taxId, 'paired-mRNA-fraction')[0] is not None:
            continue

        size = countSpeciesCDS(taxId)

        # Floor division keeps this an integer; true division would hand
        # range() a float under Python 3.
        # NOTE(review): a species with fewer than `fractionSize` CDSs gets
        # zero fractions and is therefore never computed -- confirm intended.
        numFractions = size // fractionSize
        expectedFractions[taxId] = numFractions
        for i in range(numFractions):
            call = dask.delayed(calcNativePairedFraction)(taxId, i,
                                                          numFractions)
            delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    # Submit all delayed calculations; futures are obtained immediately.
    futures = scheduler.compute(delayedCalls)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort; the explicit wait below still runs.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            (taxId, fraction, cdsCount, countPairedNucleotides,
             countTotalNucleotides) = scheduler.gather(f)

            current = results.get(taxId, (0, 0, 0, set()))
            results[taxId] = (current[0] + cdsCount,
                              current[1] + countPairedNucleotides,
                              current[2] + countTotalNucleotides,
                              current[3] | {fraction})

        except Exception as e:
            # A failed fraction is counted; its species is rejected below
            # because not all of its fractions were received.
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        received = result[3]
        expected = expectedFractions.get(taxId)

        # Fraction indices must be contiguous from 0 ...
        contiguous = len(received) == max(received) + 1
        # ... and there must be as many as were submitted; the contiguity
        # check alone cannot notice that the highest-numbered ones are missing.
        complete = expected is None or len(received) == expected

        if not (contiguous and complete):
            print("Found invalid number of items for taxId=%d" % taxId)
            continue

        if result[2] == 0:
            print("No nucleotides counted for taxId=%d, skipping" % taxId)
            continue

        fraction = float(result[1]) / result[2]

        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % fraction,
                           "computed (v3)",
                           overwrite=False)

        print("TaxId: %d\t\tFraction: %.4g" % (taxId, fraction))

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    return results