def createRandomizedSeqs_CDS_with_3UTR(
        cds,
        newShuffleIds,
        shuffleType=db.Sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation,
        taxId=None):
    """Create one randomized sequence for every entry of ``newShuffleIds``.

    The CDS part is randomized with ``SynonymousCodonPermutingRandomization``
    (built from ``cds.getTranslationTable()``) and the 3'UTR part with
    ``NucleotidePermutationRandomization``; ``shuffleType`` selects the
    wrapper that combines the two.

    Args:
        cds: CDS helper object; this function calls its
            ``getTranslationTable()``, ``getTaxId()``, ``sequence()``,
            ``CDSlength()`` and ``getProtId()`` methods.
        newShuffleIds: Iterable of shuffle ids. Only the number of entries is
            used here; the id values themselves are not read.
        shuffleType: One of the three ``db.Sources.ShuffleCDS_synon_perm_and_3UTR_*``
            constants handled below.
        taxId: Forwarded to ``CDSand3UTRRandomizationIncludingNextCDS`` for
            the two "Including_Next_CDS" shuffle types; unused otherwise.

    Returns:
        List of randomized sequences, one per entry of ``newShuffleIds``.

    Raises:
        Exception: If ``shuffleType`` is not recognized, or if the randomizer
            reports fewer than 500 possible permutations.
        AssertionError: If the reported identity is not in ``(0.0, 1.0]``.
    """
    sources = db.Sources
    codonRandomizer = SynonymousCodonPermutingRandomization(
        cds.getTranslationTable())
    utrRandomizer = NucleotidePermutationRandomization()

    if shuffleType == sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation:
        shuffler = CDSand3UTRRandomization(codonRandomizer, utrRandomizer)
    elif shuffleType == sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation_Including_Next_CDS:
        shuffler = CDSand3UTRRandomizationIncludingNextCDS(
            codonRandomizer, utrRandomizer,
            taxId=taxId, constantOverlaps=False)
    elif shuffleType == sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation_Including_Next_CDS_Constant_Overlaps:
        shuffler = CDSand3UTRRandomizationIncludingNextCDS(
            codonRandomizer, utrRandomizer,
            taxId=taxId, constantOverlaps=True)
    else:
        raise Exception("Unknown shuffleType={}".format(shuffleType))

    # NOTE(review): the genome model and the CDS length are fetched but never
    # used below. The calls are kept in case they have side effects
    # (e.g. cache warm-up or validation) -- confirm before removing.
    _genomeModel = getGenomeModelFromCache(cds.getTaxId())
    originalSeq = cds.sequence()
    _stopCodonPos = cds.CDSlength()

    shuffledSeqs = []
    for _ in newShuffleIds:
        try:
            permutationCount, identity, shuffledSeq = shuffler.randomize(
                originalSeq, cds.getProtId())
        except Exception as err:
            # Report the failure, then let it propagate to the caller.
            print(err)
            raise

        assert identity <= 1.0 and identity > 0.0

        if identity > 0.95:
            print(
                "Warning: Identity of randomized sequence is high - %.3g%% (length=%d nt, total permutations=%.2g)"
                % (identity * 100.0, len(shuffledSeq), permutationCount))

        if permutationCount < 500:
            raise Exception(
                "Low number of possible permutations %.2g (length=%d nt, identity=%.3g%%)"
                % (permutationCount, len(shuffledSeq), identity * 100.0))

        shuffledSeqs.append(shuffledSeq)

    return shuffledSeqs
def getIdentifiersConversionTableUsingGff3():
    """Return the module-level table mapping gene identifiers to protein ids.

    The table lives in the module-level ``altIdentifiers`` dict and is built
    on first use, for the module-level ``taxId``: every CDS yielded by
    ``SpeciesCDSSource(taxId)`` contributes its gene id, plus every equivalent
    identifier reported by the genome model, each mapped to the CDS's
    protein id. Later calls return the already-populated dict.

    Returns:
        The ``altIdentifiers`` dict (identifier -> protein id).
    """
    global altIdentifiers
    if altIdentifiers:
        # Already populated by an earlier call.
        return altIdentifiers

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        geneId = cds.getGeneId()

        # findEquivalentIdentifiers() may return None when nothing is known
        # about the identifier; treat that as "no alternative names".
        alts = gm.findEquivalentIdentifiers(geneId)
        for alt in alts or ():
            altIdentifiers[alt] = protId

        altIdentifiers[geneId] = protId

    # Previously the first (table-building) call fell off the end and
    # returned None; return the freshly built table instead.
    return altIdentifiers
def testCDSand3UTRRandomizationIncludingNextCDS(
        taxId: int = 511145,
        geneticCode: int = 11,
        constantOverlaps: bool = False) -> int:
    """Smoke-test ``CDSand3UTRRandomizationIncludingNextCDS`` on a whole genome.

    For every CDS yielded by the genome model's ``allCDSSource()``, the
    native sequence is randomized 20 times; each result must have the same
    length as the input sequence. A summary of the counters is printed at
    the end.

    Args:
        taxId: Taxonomy id of the genome to iterate over.
        geneticCode: Genetic code passed to
            ``SynonymousCodonPermutingRandomization``.
        constantOverlaps: Forwarded to the randomizer under test.

    Returns:
        Always 0.

    Raises:
        AssertionError: If a randomized sequence differs in length from the
            native sequence.
    """
    from data_helpers import SpeciesCDSSource
    from genome_model import getGenomeModelFromCache

    randomizer = CDSand3UTRRandomizationIncludingNextCDS(
        SynonymousCodonPermutingRandomization(geneticCode=geneticCode),
        NucleotidePermutationRandomization(),
        taxId,
        constantOverlaps=constantOverlaps)

    succeeded = 0
    failed = 0             # sequence-loading failures + randomize() failures
    randomizeFailures = 0  # randomize() failures only
    skipped = 0            # never incremented; reported for completeness

    for protId in getGenomeModelFromCache(taxId).allCDSSource():
        try:
            nativeSeq = CDSHelper(taxId, protId).sequence()
        except Exception:
            failed += 1
            continue

        for _ in range(20):
            try:
                result = randomizer.randomize(nativeSeq, protId)
            except Exception as err:
                print(
                    "Caught exception during call to randomize(), protId={}!".
                    format(protId))
                print(err)
                failed += 1
                randomizeFailures += 1
                continue

            # result[0] is compared against 1e5 and result[2] is the
            # randomized sequence (see the length check below).
            if result[0] < 1e5:
                print(protId)

            if len(result[2]) != len(nativeSeq):
                # Dump the offending result and repeat the call (handy when
                # stepping through with a debugger) before the assert fires.
                print(result)
                randomizer.randomize(nativeSeq, protId)

            assert len(result[2]) == len(nativeSeq)
            succeeded += 1

    print("OK: {}, NotOK: {}, Skipped: {}, Total: {}".format(
        succeeded, failed, skipped, succeeded + failed + skipped))
    print("randomize exception: {}".format(randomizeFailures))

    return 0
def processGenome(args, taxId):
    """Extract the region around each CDS start and write a per-gene CSV.

    For every protein id yielded by ``SpeciesCDSSource(taxId)`` the matching
    genome-model feature is located, the distance to the neighbouring feature
    (as reported by ``find5PrimeFlankingRegion``) is computed, and a short
    window ending with the first three nucleotides of the CDS is extracted.
    Genes whose window does not end in ``TG`` are skipped. The windows
    (without their last three nucleotides) are passed to
    ``calculateaSDEnergies``, and one line per retained gene is written to
    ``outputData.format(taxId)``::

        protId,geneId,strand,flankingLength,windowSequence,aSDvalue

    NOTE(review): several names here say "3UTR" although the region is
    obtained through ``find5PrimeFlankingRegion`` and ends at the start
    codon -- presumably a naming leftover; confirm.

    Args:
        args: Opaque options object, forwarded to ``calculateaSDEnergies``.
        taxId: Taxonomy id of the genome to process.

    Returns:
        None. Results are written to the output file and progress is printed.
    """
    totalProteinsProcessed = 0
    totalSkipped = 0
    seqsForWriting = []
    recordsForWriting = {}

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        totalProteinsProcessed += 1

        geneId = cds.getGeneId()

        # feature[0] selects the molecule model and feature[1] is the
        # interval carrying begin/end/data.
        feature = gm.findFeatureById(protId)
        strand = feature[1].data['strand']

        # The flanking-region lookup is the same for both strands.
        molecule = gm.moleculeModels[feature[0]]
        otherFeature = molecule.find5PrimeFlankingRegion(feature[1])
        if otherFeature is None:
            totalSkipped += 1
            continue

        currFeature = otherFeature['curr-feature']
        neighbourFeature = otherFeature['downstream-feature']
        assert neighbourFeature.begin <= neighbourFeature.end

        if strand == '+':
            flanking3UTRRegionLengthNt = currFeature.begin - neighbourFeature.end
            # Include the first 3 nucleotides of the CDS.
            threePrimeUTRCoords = (feature[1].begin - 20, feature[1].begin + 2, False)
        else:
            flanking3UTRRegionLengthNt = neighbourFeature.begin - currFeature.end
            # Include the first 3 nucleotides of the CDS.
            # NOTE(review): the third element is presumably a
            # reverse-complement flag for getSequence() -- confirm.
            threePrimeUTRCoords = (feature[1].end - 3, feature[1].end + 20, True)

        threePrimeUTR = molecule.getSequence(*threePrimeUTRCoords)

        if flanking3UTRRegionLengthNt < -50:
            # Reported only; such genes are still processed.
            print("Warning: found gene with apparent long overlap: {},{},{},{},{}".format(
                protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))

        if threePrimeUTR.seq[-2:] != 'TG':
            # The window should end with the start codon (xTG); skip the gene
            # when it does not. (The message used to say "with", which
            # contradicted this condition.)
            print("Warning: skipping gene without start codon at the correct place: {},{},{},{},{}".format(
                protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))
            totalSkipped += 1
            continue

        # All done - record the output for this gene.
        recordsForWriting[protId] = (
            geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq)
        # The last 3 nucleotides (the start codon) are dropped from the
        # sequence handed to the energy calculation.
        seqsForWriting.append(
            SeqRecord(Seq(threePrimeUTR.seq[:-3], NucleotideAlphabet), id=protId))

    aSD = calculateaSDEnergies(seqsForWriting, args, taxId)
    print(len(aSD))

    with open(outputData.format(taxId), 'wt') as fout:
        for protId, record in recordsForWriting.items():
            # Genes with no computed value are written with "None".
            aSDval = aSD.get(protId)
            vals = (protId,) + record + (aSDval,)
            fout.write("{},{},{},{},{},{}\n".format(*vals))

    print("Processed {} coding sequences for taxid {}".format(
        totalProteinsProcessed, taxId))
    print("Skipped {} coding sequences".format(totalSkipped))
def readFile(self):
    """Load operon membership from the ODB4 table into ``self.geneInfo``.

    Reads the tab-delimited file at ``ODB4_csv_path`` (skipping its header
    line) and, for every operon whose strand can be determined
    unambiguously, stores for each gene a tuple
    ``(position in operon, number of genes in operon, operon name)``.
    Entries are keyed by ``self.getGeneIdentifier(...)``; the taxId is part
    of the key only when no ``self.taxIdFilter`` is set.
    """
    self.geneInfo = {}

    with open(ODB4_csv_path) as csvfile:
        rows = csv.reader(csvfile, delimiter='\t')

        for rowNum, row in enumerate(rows, start=1):
            if rowNum == 1:
                continue  # header line

            taxId = int(row[ODB4_format.TaxId])
            if self.taxIdFilter is not None and taxId != self.taxIdFilter:
                continue  # restricted to a single taxId

            gm = getGenomeModelFromCache(taxId)
            operonName = row[ODB4_format.OperonName]
            genesInOrder = row[ODB4_format.GenesOrder].split(',')
            operonSize = len(genesInOrder)

            # ODB4 lists genes relative to the positive strand, even for
            # operons transcribed from the negative strand, so the real
            # gene order depends on the operon's strand. Collect the strand
            # of every gene that can be matched to a gff3 feature.
            strands = []
            for gid in genesInOrder:
                idents = gm.findEquivalentIdentifiers(gid)
                if idents is None and gid[:2] == "HP" and gid[:3] != "HP_":
                    # Workaround for H. pylori: retry with the "HP_" prefix.
                    idents = gm.findEquivalentIdentifiers("HP_" + gid[2:])
                if idents is None:
                    continue

                # Use the first identifier associated with a gff3 feature.
                for ident in idents:
                    found = gm.findFeatureById(ident)
                    if found is not None:
                        strands.append(found[1].data['strand'])
                        break

            if not strands:
                # With no strand found, this operon cannot be used.
                print("Missing info for {}".format(operonName))
                continue

            if not allItemsAreEqual(strands):
                # Genes of one operon must all lie on the same strand.
                print("Conflicting info for {}".format(operonName))
                continue

            strand = strands[0]
            assert strand in ('+', '-')

            # Store position information for every gene in this operon.
            for pos, gene in enumerate(genesInOrder):
                if gene[:2] == "HP" and gene[:3] != "HP_":
                    # Workaround for H. pylori
                    gene = "HP_" + gene[2:]

                if strand == '+' or taxId == 169963:
                    # Listed order is kept for '+' operons, and (as a
                    # workaround) for Listeria, taxId 169963, on either strand.
                    rank = pos
                else:
                    # Negative strand: reverse the listed order.
                    rank = operonSize - pos - 1
                geneData = (rank, operonSize, operonName)

                if self.taxIdFilter is None:
                    # No filter defined; store the taxId with the entry.
                    key = self.getGeneIdentifier(gene, taxId=taxId)
                else:
                    # Filtered to a single species; taxId is not needed.
                    key = self.getGeneIdentifier(gene)
                self.geneInfo[key] = geneData