def getIdentifiersConversionTableUsingGff3():
    """Return the table mapping gene identifiers to protein ids.

    The table is stored in the module-level dict ``altIdentifiers`` and is
    built lazily: if that dict is already non-empty it is returned as-is.
    Otherwise every protein id yielded by ``SpeciesCDSSource(taxId)`` is
    visited, and both its gene id and every identifier reported by
    ``gm.findEquivalentIdentifiers(geneId)`` are mapped to the protein id.

    Returns:
        dict: the (possibly freshly populated) ``altIdentifiers`` table.

    NOTE(review): ``taxId`` is not a parameter; it is read from the enclosing
    (presumably module-level) scope -- confirm it is defined before the first
    call. The cache is also not keyed by ``taxId``, so it appears to assume a
    single species per process -- confirm.
    """
    global altIdentifiers

    # Reuse the table when it has already been populated.
    if altIdentifiers:
        return altIdentifiers

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        geneId = cds.getGeneId()

        # Register every alternative spelling of the gene id...
        for altId in gm.findEquivalentIdentifiers(geneId):
            altIdentifiers[altId] = protId
        # ...and the gene id itself.
        altIdentifiers[geneId] = protId

    # Previously the freshly built table was not returned, so the first call
    # yielded None while later (cached) calls yielded the dict.
    return altIdentifiers
def testSpecies(taxId):
    """Count how many CDS gene ids of a species appear in its PaxDB data.

    Every protein id yielded by ``SpeciesCDSSource(taxId)`` is converted to a
    gene id via ``CDSHelper.getGeneId()`` and checked for membership in the
    object returned by ``getSpeciesPaxdbData(taxId)``. A one-line summary is
    printed to stdout.

    Args:
        taxId: Species identifier passed to all lookups.

    Returns:
        tuple: ``(countFound, countNotFound)``.
    """
    paData = getSpeciesPaxdbData(taxId)

    countFound = 0
    countNotFound = 0

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId=taxId, protId=protId)
        geneId = cds.getGeneId()

        if geneId in paData:
            countFound += 1
        else:
            countNotFound += 1

    total = countFound + countNotFound
    # A species with no CDS entries would otherwise raise ZeroDivisionError;
    # report 0% in that case.
    percentFound = countFound / total * 100 if total else 0.0

    print("Species: {} -> Found: {} ({:.3}%) Not found: {}".format(taxId, countFound, percentFound, countNotFound))

    return (countFound, countNotFound)
def processGenome(args, taxId):
    """Collect start-codon context for every CDS of a species and write it as CSV.

    For each protein id yielded by ``SpeciesCDSSource(taxId)``:

    * the matching feature is looked up in the cached genome model;
    * the neighbouring feature is obtained with ``find5PrimeFlankingRegion()``
      and the distance between the two is computed (negative values mean the
      features overlap);
    * a short sequence window that includes the first 3 nucleotides of the
      CDS is extracted, and the CDS is skipped unless that window ends in
      ``TG``.

    The windows (minus their last 3 nucleotides) are passed to
    ``calculateaSDEnergies()``; one line per retained CDS is then written to
    the file named by ``outputData.format(taxId)`` with the columns
    ``protId, geneId, strand, flanking length, window sequence, aSD value``.
    Progress and warnings are printed to stdout.

    Args:
        args: Passed through unchanged to ``calculateaSDEnergies()``.
        taxId: Species identifier used for all lookups and for the output path.

    Returns:
        None.

    NOTE(review): the local names mention a 3' UTR, but the region comes from
    ``find5PrimeFlankingRegion()`` and the window ends at the start of the
    CDS -- the names look like a leftover; confirm before relying on them.
    """
    totalProteinsProcessed = 0
    totalSkipped = 0

    seqsForWriting = []
    recordsForWriting = {}

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        totalProteinsProcessed += 1

        geneId = cds.getGeneId()

        # feature[0] selects the molecule model, feature[1] is the feature
        # itself (exposes .begin, .end and .data).
        feature = gm.findFeatureById(protId)
        cdsFeature = feature[1]
        strand = cdsFeature.data['strand']
        molecule = gm.moleculeModels[feature[0]]

        # The neighbour lookup is the same for both strands.
        otherFeature = molecule.find5PrimeFlankingRegion(cdsFeature)
        if otherFeature is None:
            totalSkipped += 1
            continue

        neighbour = otherFeature['downstream-feature']
        assert neighbour.begin <= neighbour.end
        current = otherFeature['curr-feature']

        # The third tuple element is forwarded to getSequence(); it is True
        # only for the non-'+' strand -- presumably a reverse-complement
        # flag, TODO confirm.
        # NOTE(review): the offsets differ between strands (begin+2 vs
        # end-3); presumably this follows getSequence()'s coordinate
        # convention -- confirm.
        if strand == '+':
            flanking3UTRRegionLengthNt = current.begin - neighbour.end
            # include the first 3 nucleotides of the CDS
            threePrimeUTRCoords = (cdsFeature.begin - 20, cdsFeature.begin + 2, False)
        else:
            flanking3UTRRegionLengthNt = neighbour.begin - current.end
            # include the first 3 nucleotides of the CDS
            threePrimeUTRCoords = (cdsFeature.end - 3, cdsFeature.end + 20, True)

        threePrimeUTR = molecule.getSequence(*threePrimeUTRCoords)

        # Long overlaps are only reported; such genes are still processed.
        if flanking3UTRRegionLengthNt < -50:
            print("Warning: found gene with apparent long overlap: {},{},{},{},{}".format(protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))

        # The window must end with the last two bases of the start codon
        # (xTG); otherwise the coordinates do not line up and the gene is
        # dropped.
        if threePrimeUTR.seq[-2:] != 'TG':
            print("Warning: skipping gene without start codon at the correct place: {},{},{},{},{}".format(protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))
            totalSkipped += 1
            continue

        # All done - queue the record for output
        recordsForWriting[protId] = (geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq)
        # The start codon (last 3 nt of the window) is excluded from the
        # sequence handed to the energy calculation.
        seqsForWriting.append(SeqRecord(Seq(threePrimeUTR.seq[:-3], NucleotideAlphabet), id=protId))

    aSD = calculateaSDEnergies(seqsForWriting, args, taxId)
    print(len(aSD))

    with open(outputData.format(taxId), 'wt') as fout:
        for protId, record in recordsForWriting.items():
            # Proteins without a computed value are written with "None".
            aSDval = aSD.get(protId)
            vals = (protId,) + record + (aSDval,)
            fout.write("{},{},{},{},{},{}\n".format(*vals))

    print("Processed {} coding sequences for taxid {}".format(totalProteinsProcessed, taxId))
    print("Skipped {} coding sequences".format(totalSkipped))