Beispiel #1
0
def getRandomizedSequenceCacheForVerticalPermutations(taxId):
    """Return the vertical-permutation randomization cache for a species.

    Caches are kept in the module-level ``_caches`` dict under the key
    ``(taxId, db.Sources.ShuffleCDS_vertical_permutation_1nt)``.  On a miss,
    the native sequence of every CDS whose length is a multiple of 3 is
    loaded, a new ``VerticalRandomizationCache`` is built from them, stored
    in ``_caches`` and returned.
    """
    global _caches

    cacheKey = (taxId, db.Sources.ShuffleCDS_vertical_permutation_1nt)
    if cacheKey in _caches:
        return _caches[cacheKey]

    # Cache miss: read all native sequences, skipping any CDS whose length
    # is not divisible by 3.
    nativeSeqs = {}
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)
        if helper.length() % 3 != 0:
            continue
        nativeSeqs[protId] = helper.sequence()

    geneticCode = getSpeciesTranslationTable(taxId)
    scpr = SynonymousCodonPermutingRandomization(geneticCode)

    def randomizer(cdss):
        # Delegates to the permuting randomizer built for this genetic code.
        return scpr.verticalPermutation(cdss)

    newCache = VerticalRandomizationCache(
        shuffleType=db.Sources.ShuffleCDS_vertical_permutation_1nt,
        taxId=taxId,
        nativeSeqsMap=nativeSeqs,
        geneticCode=geneticCode,
        randomizer=randomizer)
    _caches[cacheKey] = newCache
    print(_caches.keys())

    return newCache
Beispiel #2
0
def writeSequenceToTempFile(taxId):
    """Write all CDS sequences of a species, concatenated, to a temp FASTA file.

    Sequences whose length is not a multiple of 3 are left out.  The
    remaining ones are joined into a single record with id "allCDSs".

    Returns a tuple ``(number of records written, temp file object)``.  The
    temp file is created with ``delete=(not debugMode)``.
    """
    print("Fetching sequence for taxid={}".format(taxId))

    fragments = []
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)

        if helper.length() % 3:
            continue

        fragments.append(helper.sequence())

        # Progress marker, printed as the collected count reaches 999, 1999, ...
        if len(fragments) % 1000 == 999:
            print(".")

    combined = SeqRecord(Seq(''.join(fragments), NucleotideAlphabet),
                         id="allCDSs",
                         description="")
    records = [combined]

    tempFile = NamedTemporaryFile(mode="w", delete=(not debugMode))
    # SeqIO writes through the file's name rather than the open handle.
    SeqIO.write(records, tempFile.name, "fasta")

    return (len(records), tempFile)
def getIdentifiersConversionTableUsingGff3():
    """Return the table mapping gene identifiers (and aliases) to protein ids.

    The table lives in the module-level ``altIdentifiers`` dict.  If it is
    already populated it is returned as-is; otherwise it is filled from the
    genome model of the module-level ``taxId``: for every CDS, the gene id
    and each identifier reported by ``findEquivalentIdentifiers`` for that
    gene id are mapped to the CDS's protein id.

    Returns:
        The ``altIdentifiers`` dict.
    """
    global altIdentifiers

    if altIdentifiers:
        return altIdentifiers

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        geneId = cds.getGeneId()
        for alias in gm.findEquivalentIdentifiers(geneId):
            altIdentifiers[alias] = protId
        # Assigned last, so the gene id itself always maps to this protein
        # even if it also appears among its own aliases.
        altIdentifiers[geneId] = protId

    # Return the freshly built table too, so the first call behaves like
    # every later (cached) call instead of yielding None.
    return altIdentifiers
Beispiel #4
0
def testSpecies(taxId):
    """Count how many CDS gene ids of a species occur in its paxdb data.

    Iterates over every protein id of ``taxId``, looks up its gene id and
    checks whether it is a key of the data returned by
    ``getSpeciesPaxdbData``.  Prints a one-line summary.

    Returns:
        Tuple ``(countFound, countNotFound)``.
    """
    paData = getSpeciesPaxdbData(taxId)

    countFound = 0
    countNotFound = 0

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId=taxId, protId=protId)

        if cds.getGeneId() in paData:
            countFound += 1
        else:
            countNotFound += 1

    total = countFound + countNotFound
    # A species with no CDS entries would otherwise divide by zero; report 0%.
    # float() forces true division, and keeps the value a float for the
    # "{:.3}" format spec, under Python 2 as well.
    percentFound = float(countFound) / total * 100 if total else 0.0

    print("Species: {} -> Found: {} ({:.3}%) Not found: {}".format(taxId, countFound, percentFound, countNotFound))
    return (countFound, countNotFound)
# Run-wide counters; none of them is updated in the visible part of this script.
selected = 0
alreadyCompleted = 0
totalMissingResults = 0

# NOTE(review): presumably collects calls to be dispatched later - nothing is
# appended to it in the visible part of this script; confirm against the rest.
queuedDelayedCalls = []

# Process each species (tax-id) in turn.
for taxIdForProcessing in species:
    print("Processing %d sequences for tax-id %d (%s)..." %
          (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
           getSpeciesName(taxIdForProcessing)))

    # Per-species tally of how many sequences hit each outcome.
    stats = Counter()

    # Iterate over all CDS entries for this species
    # TODO - preloading all sequences and results should optimize this
    for protId in SpeciesCDSSource(taxIdForProcessing):

        stats['all-sequences'] += 1

        #protId = codecs.decode(protId)
        # Filtering

        # Only process 1/N of the sequences, selected randomly (N=randomFraction)
        # (if randomFraction==1, all sequences will be processed)
        if (randint(1, randomFraction) != 1):
            # NOTE(review): `skipped` is not initialized in the visible part of
            # this script - confirm it is defined before this loop runs.
            skipped += 1
            stats['skipped-random-fraction'] += 1
            continue

        # ------------------------------------------------------------------------------------------
        # Exclude some sequences from the calculation
# Parse the command line (argsParser is set up outside the visible part of the script).
args = argsParser.parse_args()

# Configuration
taxId = args.taxId

# Accumulates the value returned by dropShuffledSeqs() for each CDS.
#statsShuffles = RunningStats()
statsShuffles = OfflineStats()

recordsCount = 0
warningsCount = 0

# NOTE(review): presumably rl() returns True at most once per 30 time units,
# throttling the progress output below - confirm against RateLimit.
rl = RateLimit(30)

# Total number of CDS records, used only for the progress percentage.
total = countSpeciesCDS(taxId)

for protId in SpeciesCDSSource(taxId):
    cds = CDSHelper(taxId, protId)

    # Drop the shuffled sequences of this CDS, passing the value of the
    # keep_first_n_shuffles option as lastItemToKeep, and record the result.
    statsShuffles.push(
        cds.dropShuffledSeqs(lastItemToKeep=args.keep_first_n_shuffles))

    recordsCount += 1

    # Progress report, emitted only when the rate limiter allows it.
    if (rl()):
        print("processed %d records (%.2g%%)" %
              (recordsCount, float(recordsCount) / total * 100))

    # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY #
    # Uncomment to stop after the first records while debugging:
    #if( recordsCount > 20 ):
    #    break
    # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY #
Beispiel #7
0
def getAllProteins(taxId):
    """Return every item yielded by ``SpeciesCDSSource(taxId)`` as a list."""
    proteinIds = list(SpeciesCDSSource(taxId))
    return proteinIds
def plotStatistics():
    """Plot readthrough statistics and store per-CDS readthrough labels.

    Steps:
      1. Load the metadata file and every file in ``measurement_files`` from
         ``data_path`` and call ``plotDatafileStatistics`` on each data file.
      2. Stack the "RP ratio" values (element 5 of ``readReadthroughData``)
         of all experiments, print a few summary values and save a heatmap
         of their pairwise Spearman correlation to
         ``RP_distribution_spearman.pdf``.
      3. For each experiment ``i``, split the genes with a finite ratio into
         those above ``readthroughThreshold`` and those at or below it, map
         their source identifiers through ``getIdentifiersMapping()`` and
         store the CDS property ``readthrough-v2.ex<i>`` ("1" / "0") for
         every protein of the module-level ``taxId`` that was matched.

    Genes whose ratio is NaN or infinite receive no label.
    """
    metadata = io.loadmat("{}{}".format(data_path, metadata_file))
    sourceIdentifiersTable = metadata["gene_id"]

    def getSourceGeneId(idx: int) -> str:
        # Each table entry is nested; the identifier is at [0][0].
        return sourceIdentifiersTable[idx][0][0]

    idTable = getIdentifiersMapping()

    allData = [
        io.loadmat("{}{}".format(data_path, fn)) for fn in measurement_files
    ]

    for data, fn in zip(allData, measurement_files):
        plotDatafileStatistics(data, fn)

    # One row per experiment.
    RPratios = np.stack([readReadthroughData(fn)[5] for fn in allData])
    ORFreads = np.stack([readReadthroughData(fn)[3] for fn in allData])
    ORFreads[np.isnan(ORFreads)] = 0.0
    print(ORFreads.shape)
    # Copy with NaN/inf zeroed, used only for the diagnostic min/max below.
    RPratios_ = RPratios.copy()
    RPratios_[np.isnan(RPratios_)] = 0.0
    RPratios_[np.isinf(RPratios_)] = 0.0
    print("//")
    print(np.min(RPratios[~np.isnan(RPratios)]))
    print(np.max(RPratios[~np.isnan(RPratios)]))
    print(np.min(RPratios_))
    print(np.max(RPratios_))

    # Does the "RP ratios" metric correlate between the different experiments?
    rs = spearmanr(RPratios, axis=1, nan_policy="omit").correlation

    fig, ax1 = plt.subplots()
    sns.heatmap(rs, annot=True, ax=ax1)
    plt.savefig("RP_distribution_spearman.pdf")
    plt.close(fig)

    print(RPratios[0, :].shape)

    from data_helpers import SpeciesCDSSource, setCDSProperty

    for i, fn in enumerate(measurement_files):

        ratios = RPratios[i, :]
        isFinite = np.isfinite(ratios)
        # The indices must be positions in the FULL row, because they are
        # used to index sourceIdentifiersTable.  Masking the row before
        # calling np.nonzero would renumber them and shift every gene that
        # follows a non-finite value.  errstate silences the comparison
        # warning some NumPy versions emit for NaN.
        with np.errstate(invalid="ignore"):
            isAbove = ratios > readthroughThreshold
            isAtOrBelow = ratios <= readthroughThreshold
        selectedPos = frozenset(np.nonzero(isFinite & isAbove)[0])
        selectedNeg = frozenset(np.nonzero(isFinite & isAtOrBelow)[0])
        print("///////////////////////")
        print(i)

        positiveIdentifiersSourceFmt = frozenset(
            [getSourceGeneId(x) for x in selectedPos])
        negativeIdentifiersSourceFmt = frozenset(
            [getSourceGeneId(x) for x in selectedNeg])
        assert (not positiveIdentifiersSourceFmt.intersection(
            negativeIdentifiersSourceFmt))

        # Sets give constant-time membership tests in the loop below.
        # Unmapped source identifiers contribute None, which matches no protId.
        positiveIdentifiersNativeFmt = frozenset(
            idTable.get(x, None) for x in positiveIdentifiersSourceFmt)
        negativeIdentifiersNativeFmt = frozenset(
            idTable.get(x, None) for x in negativeIdentifiersSourceFmt)

        countMarkedPositive = 0
        countMarkedNegative = 0

        for protId in SpeciesCDSSource(taxId):
            valForProt = None

            if protId in positiveIdentifiersNativeFmt:
                valForProt = "1"
                countMarkedPositive += 1

            elif protId in negativeIdentifiersNativeFmt:
                valForProt = "0"
                countMarkedNegative += 1

            # Proteins matched by neither set are left untouched.
            if valForProt is not None:
                setCDSProperty(taxId,
                               protId,
                               "readthrough-v2.ex{}".format(i),
                               valForProt,
                               overwrite=True)

        print(countMarkedPositive)
        print(countMarkedNegative)
# Base URL of the REST service queried by this script.
server = "http://rest.ensemblgenomes.org"
# Path template for the cross-reference lookup; %s is replaced by the identifier.
# The query string requests JSON output with all_levels=1.
ext = "/xrefs/id/%s?content-type=application/json;all_levels=1"


def getRecord(protid, timeout=60):
    """Fetch the cross-reference record for ``protid`` from the REST server.

    Args:
        protid: Identifier substituted into the ``ext`` path template.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout a stalled connection would block the script forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    r = requests.get(server + ext % protid, timeout=timeout)

    # raise_for_status() raises exactly when r.ok is false, so no separate
    # check (or exit call after it) is needed.
    r.raise_for_status()

    # Rate-limit requests
    sleep(0.5)

    return r.json()


# Fetch and print the record for every identifier of this species; getRecord()
# sleeps after each request, so the loop is throttled.
for protid in SpeciesCDSSource(taxid):
    record = getRecord(protid)
    print(record)

#with open(f, 'r') as csvfile:
#    for row in csv.reader(csvfile, delimiter='\t'):
#        #['3706992', '224308.Bsubs1_010100004063', '4.46']

#        paxId = row[1].split(".")[1]
#        pa = float(row[2])
def processGenome(args, taxId):
    """Extract the sequence window at each CDS start and write it with its aSD value.

    For every CDS of ``taxId``:
      * the matching feature is looked up in the genome model, and the
        neighbour is taken from ``find5PrimeFlankingRegion``; CDSs without
        one are skipped;
      * the distance to that neighbour is computed (it is negative when the
        two features overlap);
      * a short window ending just inside the CDS is read from the molecule;
        CDSs whose window does not end in 'TG' are skipped with a warning.

    The windows, minus their last 3 nucleotides, are passed to
    ``calculateaSDEnergies``.  One CSV line per kept CDS
    (protId, geneId, strand, flanking length, window, aSD value) is written
    to ``outputData.format(taxId)``.

    Args:
        args: Passed through unchanged to ``calculateaSDEnergies``.
        taxId: Species whose coding sequences are processed.
    """
    totalProteinsProcessed = 0
    totalSkipped = 0

    seqsForWriting = []
    recordsForWriting = {}

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        totalProteinsProcessed += 1

        geneId = cds.getGeneId()

        # feature[0] selects the molecule model; feature[1] carries the
        # coordinates (begin/end) and the strand.
        feature = gm.findFeatureById(protId)
        strand = feature[1].data['strand']
        moleculeModel = gm.moleculeModels[feature[0]]

        # The neighbour lookup is the same for both strands.
        otherFeature = moleculeModel.find5PrimeFlankingRegion(feature[1])
        if otherFeature is None:
            totalSkipped += 1
            continue

        assert(otherFeature['downstream-feature'].begin <= otherFeature['downstream-feature'].end)

        if strand == '+':
            flanking3UTRRegionLengthNt = otherFeature['curr-feature'].begin - otherFeature['downstream-feature'].end
            # include the first 3 nucleotides of the CDS
            threePrimeUTRCoords = (feature[1].begin-20, feature[1].begin+2, False)
        else:
            flanking3UTRRegionLengthNt = otherFeature['downstream-feature'].begin - otherFeature['curr-feature'].end
            # include the first 3 nucleotides of the CDS
            # NOTE(review): this window spans end-3..end+20 while the '+'
            # window spans begin-20..begin+2 - confirm both have the same
            # length under getSequence's coordinate convention.
            threePrimeUTRCoords = (feature[1].end-3, feature[1].end+20, True)

        threePrimeUTR = moleculeModel.getSequence(*threePrimeUTRCoords)

        if flanking3UTRRegionLengthNt < -50:
            # Reported only; the gene is still processed.
            print("Warning: found gene with apparent long overlap: {},{},{},{},{}".format( protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq ))

        # The window should end with the start codon, so its last two
        # nucleotides must be 'TG'.
        if threePrimeUTR.seq[-2:] != 'TG':
            print("Warning: skipping gene without start codon at the correct place: {},{},{},{},{}".format( protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq ))
            totalSkipped += 1
            continue

        # All done - emit the output
        recordsForWriting[protId] = (geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq)

        # The last 3 nucleotides (the start codon) are removed before the
        # sequence is handed to the aSD calculation.
        seqsForWriting.append(SeqRecord(Seq(threePrimeUTR.seq[:-3], NucleotideAlphabet), id=protId))

    aSD = calculateaSDEnergies(seqsForWriting, args, taxId)
    print(len(aSD))

    with open(outputData.format(taxId), 'wt') as fout:
        for protId, record in recordsForWriting.items():
            # Proteins without a computed value are written with "None".
            aSDval = aSD.get(protId, None)
            vals = (protId,) + record + (aSDval,)
            fout.write("{},{},{},{},{},{}\n".format(*vals))

    print("Processed {} coding sequences for taxid {}".format(totalProteinsProcessed, taxId))
    print("Skipped {} coding sequences".format(totalSkipped))
Beispiel #11
0
def readSeriesResultsForSpecies(seriesSourceNumber,
                                species,
                                minShuffledGroups=20,
                                maxShuffledGroups=20,
                                shuffleType=db.Sources.ShuffleCDSv2_python,
                                cdsFilter=None,
                                returnCDS=True):
    """Yield the decoded series results of each qualifying CDS.

    Args:
        seriesSourceNumber: Passed to ``getAllComputedSeqsForSpecies``.
        species: A single tax-id or an iterable of tax-ids (not a string).
        minShuffledGroups: Minimum number of computed shuffles, and of
            non-missing results, a CDS needs in order to be yielded.
        maxShuffledGroups: At most this many shuffled sequences (plus the
            native one) are included per CDS.
        shuffleType: Shuffle type used to select the shuffled sequences.
        cdsFilter: Optional predicate; CDSs for which it is false are ignored.
        returnCDS: If true, each yielded dict also carries the CDS helper.

    Yields:
        Dicts with keys "taxid" and "content" (list of decoded records, with
        None for missing ones; native sequence first) and, when
        ``returnCDS`` is true, "cds".

    Raises:
        Exception: If ``species`` is a string.
    """
    # Normalize `species` to an iterable of tax-ids.
    if not isinstance(species, Iterable):
        species = (species, )  # a single (numeric) taxid value
    elif isinstance(species, basestring):
        raise Exception("species cannot be string")
    assert (minShuffledGroups <= maxShuffledGroups)

    # Native sequence plus at most maxShuffledGroups shuffles.
    maxSeqIds = maxShuffledGroups + 1

    for taxIdForProcessing in species:
        cdsCount = countSpeciesCDS(taxIdForProcessing)
        speciesName = getSpeciesName(taxIdForProcessing)
        print("Procesing %d sequences for tax-id %d (%s)..." %
              (cdsCount, taxIdForProcessing, speciesName))

        computed = getAllComputedSeqsForSpecies(seriesSourceNumber,
                                                taxIdForProcessing,
                                                maxShuffledGroups,
                                                shuffleType=shuffleType)
        computedIds = frozenset(computed.keys())
        print("Collecting data from %d computation results..." % len(computed))

        skipped = 0
        selected = 0

        # Visit every CDS entry of this species
        for protId in SpeciesCDSSource(taxIdForProcessing):
            cds = CDSHelper(taxIdForProcessing, protId)

            if cdsFilter is not None and not cdsFilter(cds):
                continue

            nativeSeqId = cds.seqId()
            shuffledIds = cds.shuffledSeqIds(shuffleType=shuffleType)

            # Number of this CDS's shuffles present among the computed results
            availableShuffles = len(computedIds & frozenset(shuffledIds))

            if (availableShuffles < minShuffledGroups
                    or nativeSeqId not in computedIds):
                skipped += 1
                continue

            # Native sequence first, then the shuffles, capped at maxSeqIds
            wantedIds = [cds.seqId()] + list(
                cds.shuffledSeqIds(shuffleType=shuffleType))
            wantedIds = wantedIds[:maxSeqIds]
            payload = [computed.get(seqId) for seqId in wantedIds]

            presentCount = sum(1 for item in payload if item is not None)
            if presentCount < minShuffledGroups:
                print("Not enough results found for %s" % protId)
                skipped += 1
                continue

            # Decode the results, leaving missing entries as None
            payload = [
                None if item is None else decodeJsonSeriesRecord(
                    decompressSeriesRecord(item)) for item in payload
            ]

            outRecord = {"taxid": taxIdForProcessing, "content": payload}
            if returnCDS:
                outRecord["cds"] = cds
            yield outRecord
            del outRecord

            # Drop local references once the consumer has resumed us
            del payload
            del cds
            selected += 1

            if rl():
                print("# %s - %d records included, %d records skipped" %
                      (datetime.now().isoformat(), selected, skipped))