def getIdentifiersConversionTableUsingGff3():
    """Return the shared ``altIdentifiers`` table, filling it on first use.

    The table maps every gene identifier (the gene id reported by
    ``CDSHelper.getGeneId()`` plus each identifier returned by
    ``findEquivalentIdentifiers`` for it) to the protein id it was found on.

    ``taxId`` is not a parameter: it is resolved from an enclosing scope that
    is not visible in this block, as is ``altIdentifiers`` itself.

    Returns:
        The ``altIdentifiers`` mapping. Once it is non-empty, later calls
        return it without rebuilding.
    """
    global altIdentifiers

    # A non-empty table means a previous call already did the work.
    if altIdentifiers:
        return altIdentifiers

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        geneId = CDSHelper(taxId, protId).getGeneId()
        for altId in gm.findEquivalentIdentifiers(geneId):
            altIdentifiers[altId] = protId
        # Written last so the primary gene id wins if it also appears among
        # its own equivalents.
        altIdentifiers[geneId] = protId

    # The cached path above returns the table; the build path must as well,
    # otherwise the very first caller receives None.
    return altIdentifiers
Exemple #2
0
def testSpecies(taxId):
    """Count how many CDS gene ids of a species appear in its PaxDB data.

    Iterates over every protein id yielded by ``SpeciesCDSSource(taxId)``,
    resolves its gene id through ``CDSHelper`` and checks membership in the
    object returned by ``getSpeciesPaxdbData(taxId)``. A one-line summary is
    printed.

    Args:
        taxId: Species identifier, passed through to the helper calls.

    Returns:
        A ``(countFound, countNotFound)`` tuple.
    """
    paData = getSpeciesPaxdbData(taxId)

    countFound = 0
    countNotFound = 0

    for protId in SpeciesCDSSource(taxId):
        geneId = CDSHelper(taxId=taxId, protId=protId).getGeneId()

        if geneId in paData:
            countFound += 1
        else:
            countNotFound += 1

    total = countFound + countNotFound
    # A species with no coding sequences would otherwise divide by zero;
    # report 0% in that case instead of crashing.
    percentFound = countFound / total * 100 if total else 0.0

    print("Species: {} -> Found: {} ({:.3}%) Not found: {}".format(taxId, countFound, percentFound, countNotFound))
    return (countFound, countNotFound)
def processGenome(args, taxId):
    """Extract the region around each CDS start and write it out with aSD energies.

    For every protein id yielded by ``SpeciesCDSSource(taxId)``:

    * look the feature up in the cached genome model and read its strand;
    * ask the molecule model for the 5' flanking region and compute the
      distance (in nt) between the current feature and its neighbour;
    * fetch a 23-nt window from the molecule model, covering 20 nt outside
      the CDS plus its first 3 nt;
    * skip the protein when no flanking region is found or when the window
      does not end in ``TG``.

    The windows (minus their last 3 nt) are passed to
    ``calculateaSDEnergies``, and one comma-separated line per retained
    protein is written to ``outputData.format(taxId)``:
    ``protId,geneId,strand,flankingLength,windowSequence,aSDvalue``.

    Args:
        args: Passed through unchanged to ``calculateaSDEnergies``.
        taxId: Species identifier used for every lookup and for the output
            file name.

    Returns:
        None. Results are written to the output file and progress is printed.
    """
    totalProteinsProcessed = 0
    totalSkipped = 0

    seqsForWriting = []
    recordsForWriting = {}

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        totalProteinsProcessed += 1

        geneId = cds.getGeneId()

        # feature[0] keys into gm.moleculeModels; feature[1] carries
        # begin/end coordinates and a data dict holding the strand.
        feature = gm.findFeatureById(protId)
        interval = feature[1]
        strand = interval.data['strand']
        molecule = gm.moleculeModels[feature[0]]

        # The lookup, the None check and the sanity assert were identical in
        # both strand branches, so they are done once here.
        flank = molecule.find5PrimeFlankingRegion(interval)
        if flank is None:
            totalSkipped += 1
            continue

        neighbour = flank['downstream-feature']
        assert neighbour.begin <= neighbour.end
        current = flank['curr-feature']

        if strand == '+':
            flankingLengthNt = current.begin - neighbour.end
            # 20 nt before the CDS plus its first 3 nt.
            windowCoords = (interval.begin - 20, interval.begin + 2, False)
        else:
            flankingLengthNt = neighbour.begin - current.end
            # Mirror image of the '+' window; the True flag presumably asks
            # getSequence for the reverse complement -- TODO confirm.
            windowCoords = (interval.end - 3, interval.end + 20, True)

        startWindow = molecule.getSequence(*windowCoords)

        if flankingLengthNt < -50:
            # Reported only; such genes are still processed.
            print("Warning: found gene with apparent long overlap: {},{},{},{},{}".format(protId, geneId, strand, flankingLengthNt, startWindow.seq))

        # The window ends with the first 3 nt of the CDS, so its last two
        # characters should be the 'TG' of a start codon.
        if startWindow.seq[-2:] != 'TG':
            print("Warning: skipping gene without start codon at the correct place: {},{},{},{},{}".format(protId, geneId, strand, flankingLengthNt, startWindow.seq))
            totalSkipped += 1
            continue

        recordsForWriting[protId] = (geneId, strand, flankingLengthNt, startWindow.seq)

        # Drop the 3 CDS nucleotides before handing the window on.
        seqsForWriting.append(SeqRecord(Seq(startWindow.seq[:-3], NucleotideAlphabet), id=protId))

    aSD = calculateaSDEnergies(seqsForWriting, args, taxId)
    print(len(aSD))

    with open(outputData.format(taxId), 'wt') as fout:
        for protId, record in recordsForWriting.items():
            # Proteins missing from aSD are written with the value None.
            vals = (protId,) + record + (aSD.get(protId),)
            fout.write("{},{},{},{},{},{}\n".format(*vals))

    print("Processed {} coding sequences for taxid {}".format(totalProteinsProcessed, taxId))
    print("Skipped {} coding sequences".format(totalSkipped))