def testSequenceScoringWithStream(self):
    """Check that scoring gives identical results for streamed and loaded FASTA.

    A sequence model is trained on sequences extracted around the regions in
    'tutorial/Kahn2014.GFF' (labelled positive) plus an equally sized set of
    generated sequences (labelled negative). A generated background set is
    then saved to FASTA and scored twice: once via ``nc.streamFASTA`` and once
    via ``nc.loadFASTA``. Both score sequences must have the same length and
    identical values.
    """
    # Positive training sequences: regions recentered to 3000 and extracted
    # from `genome` (defined elsewhere in this file).
    Kahn2014Rgn = nc.loadGFF('tutorial/Kahn2014.GFF')
    Kahn2014Seq = Kahn2014Rgn.recenter(3000).extract(genome)

    # Negative training sequences: one generated sequence per positive one.
    tneg = MC.generateSet(n=len(Kahn2014Seq), length=3000)
    trainingSet = Kahn2014Seq.label(nc.positive) + tneg.label(nc.negative)

    PyPREdictor = (
        nc.motifs.Ringrose2003GTGT()
        .pairFreq(distCut=219)
        .model(nc.logOdds(labelPositive=nc.positive,
                          labelNegative=nc.negative))
        .sequenceModel(name='PyPREdictor (M2003+GTGT)',
                       windowSize=500,
                       windowStep=250)
        .train(trainingSet)
    )
    PyPREdictor.batchsize = 100

    # Write the background set once, then read it back through both APIs.
    MC.generateSet(n=1000, length=3000).saveFASTA('./temp/test.Background.fasta')
    seq1 = nc.streamFASTA('./temp/test.Background.fasta')
    seq2 = nc.loadFASTA('./temp/test.Background.fasta')
    score1 = PyPREdictor.getSequenceScores(seq1, nStreamFetch=1000)
    score2 = PyPREdictor.getSequenceScores(seq2, nStreamFetch=1000)

    # Compare lengths first: zip() stops at the shorter input, so the summed
    # difference below is only meaningful when both have the same length.
    self.assertEqual(len(score1), len(score2))

    # The two paths are expected to agree exactly, hence comparison with 0.0
    # rather than a tolerance.
    diff = sum(abs(b - a) for a, b in zip(score1, score2))
    self.assertEqual(diff, 0.0)
def testExtractRegionSequences(self):
    """Check region sequence extraction against direct slicing of testSeqs.

    ``testSeqs`` is saved to 'temp/test.fasta', and the sequences that
    ``rsA.extractSequences`` returns are compared with slices taken directly
    from the in-memory sequences. The comparison is repeated for a loaded
    FASTA, a streamed FASTA, and a streamed FASTA with ``wantBlockSize=50``.
    """
    fastaPath = 'temp/test.fasta'
    testSeqs.saveFASTA(fastaPath)

    # Look-up table from sequence name to raw sequence content.
    seqByName = {}
    for entry in testSeqs:
        seqByName[entry.name] = entry.seq

    def assertExtractionMatches(source):
        # Extract first, then build the expectation, then compare.
        extracted = [item.seq for item in rsA.extractSequences(source)]
        # The slice runs to r.end + 1, i.e. r.end is treated as inclusive.
        expected = [seqByName[r.seq][r.start:r.end + 1] for r in rsA]
        self.assertEqual(extracted, expected)

    # Ensure that saving and re-loading sequences yields identical sequences
    assertExtractionMatches(nc.loadFASTA(fastaPath))
    assertExtractionMatches(nc.streamFASTA(fastaPath))

    # Ensure that streaming of short blocks yields identical final sequences
    assertExtractionMatches(nc.streamFASTA(fastaPath, wantBlockSize=50))
def testSaveLoadFASTA(self):
    """Check that testSeqs survives a save/load round trip through FASTA.

    Each sequence is rendered as a 'name: seq' line, and the rendering of the
    in-memory set is compared with that of the set re-loaded from
    'temp/test.fasta'.
    """
    fastaPath = 'temp/test.fasta'
    testSeqs.saveFASTA(fastaPath)

    # Rendering of the in-memory sequences.
    originalLines = []
    for original in testSeqs:
        originalLines.append('%s: %s' % (original.name, original.seq))
    sA = '\n'.join(originalLines)

    # Rendering of the re-loaded sequences.
    reloadedLines = []
    for reloaded in nc.loadFASTA(fastaPath):
        # Only the part of the name before ' from FASTA file' is compared
        # (presumably a suffix added on load -- confirm against nc.loadFASTA).
        baseName = reloaded.name.split(' from FASTA file')[0]
        reloadedLines.append('%s: %s' % (baseName, reloaded.seq))
    sB = '\n'.join(reloadedLines)

    self.assertEqual(sA, sB)