def testKSpectrum(self):
    """Check the 5-mer spectrum values reported for 'GAGAGAGAGT'.

    Six specific k-mers must carry the expected values, every other
    k-mer in the spectrum must sum to zero, and the set of k-mers with
    a positive value must be the same for the sequence and for its
    reverse complement.
    """
    spectrum = nc.features.kSpectrum(5)
    forward = nc.sequence('', 'GAGAGAGAGT')

    def scored_kmers(seq):
        # Pair each feature's k-mer with the value getAll() reports for seq.
        kmers = [feature.kmer for feature in spectrum]
        return list(zip(kmers, spectrum.getAll(seq)))

    frequencies = dict(scored_kmers(forward))

    # Expected value per k-mer, checked in this order.
    expected = (
        ('AGAGT', 100.),
        ('ACTCT', 100.),
        ('AGAGA', 200.),
        ('TCTCT', 200.),
        ('GAGAG', 300.),
        ('CTCTC', 300.),
    )
    for kmer, value in expected:
        self.assertEqual(frequencies[kmer], value)

    # Everything outside the expected k-mers must contribute nothing.
    listed = [kmer for kmer, _ in expected]
    remainder = sum(
        value for kmer, value in frequencies.items() if kmer not in listed)
    self.assertEqual(remainder, 0.)

    # Ensure that application to the reverse complement yields an
    # identical spectrum
    reverse = nc.sequence(
        '', nc.getReverseComplementaryDNASequence(forward.seq))
    present_forward = set(
        kmer for kmer, value in scored_kmers(forward) if value > 0)
    present_reverse = set(
        kmer for kmer, value in scored_kmers(reverse) if value > 0)
    self.assertEqual(present_forward, present_reverse)
def testKSpectrumMM(self):
    """Check which 5-mers the mismatch spectrum registers for 'GAGAGAGAGT'.

    The k-mers with a positive value must be exactly the 5-mers of the
    sequence, their reverse complements, and every single-position
    substitution of those. The same set must be obtained from the
    reverse complement of the sequence.
    """
    spectrum = nc.features.kSpectrumMM(5)
    forward = nc.sequence('', 'GAGAGAGAGT')
    bases = ['A', 'C', 'G', 'T']

    def single_substitutions(kmer):
        # All strings obtained by replacing one position with any base;
        # this includes kmer itself (a base replaced by the same base).
        variants = set()
        for pos in range(len(kmer)):
            for base in bases:
                variants.add(kmer[:pos] + base + kmer[pos + 1:])
        return variants

    def detected(seq):
        # K-mers whose value for seq is strictly positive.
        kmers = [feature.kmer for feature in spectrum]
        return {
            kmer
            for kmer, value in zip(kmers, spectrum.getAll(seq))
            if value > 0
        }

    # Every 5-mer window of the sequence ...
    windows = {
        forward.seq[start:start + 5] for start in range(len(forward) - 4)
    }
    # ... plus reverse complements ...
    complements = {
        nc.getReverseComplementaryDNASequence(kmer) for kmer in windows
    }
    exact = windows.union(complements)
    # ... plus all one-position substitutions of either.
    mismatched = {
        variant for kmer in exact for variant in single_substitutions(kmer)
    }
    expected = exact.union(mismatched)

    # Ensure that only the main motif and mismatches are registered
    found_forward = detected(forward)
    self.assertEqual(found_forward, expected)

    # Ensure that application to the reverse complement yields an
    # identical spectrum
    reverse = nc.sequence(
        '', nc.getReverseComplementaryDNASequence(forward.seq))
    found_reverse = detected(reverse)
    self.assertEqual(found_forward, found_reverse)
def prep():
    """Reset ./temp and populate the module-level fixtures.

    Side effects:
        * Deletes and recreates the ``./temp`` directory (relative to the
          current working directory).
        * Reads the data files under ``tutorial/``.
        * Rebinds the globals ``genome``, ``rsA``, ``rsB``, ``PcG``,
          ``gwWin``, ``PcGTargets``, ``PRESeq``, ``MC`` and ``testSeqs``.
        * Consumes values from the global ``random`` generator (shuffle of
          ``PRESeq.sequences`` and generation of ``testSeqs``), so the
          results are only reproducible if the caller seeds ``random``.

    Raises:
        OSError: if ``./temp`` cannot be created.
    """
    global genome
    global rsA, rsB
    global PcG
    global gwWin
    global PcGTargets
    global PRESeq
    global MC
    global testSeqs

    import os
    import shutil

    # Start from an empty scratch directory. Done with the standard library
    # instead of shelling out to `rm`/`mkdir`, so it works without a POSIX
    # shell and a failure to create the directory is not silently ignored.
    # A missing directory is fine, as it was with `rm -rf`.
    shutil.rmtree('./temp', ignore_errors=True)
    os.makedirs('./temp', exist_ok=True)

    genome = nc.streamFASTAGZ(
        'tutorial/DmelR5.fasta.gz',
        restrictToSequences=['2L', '2R', '3L', '3R', '4', 'X'])

    # Two small hand-written region sets on sequences 'X' and 'Y'.
    rsA = nc.regions('A', [
        nc.region('X', 20, 50),
        nc.region('X', 80, 100),
        nc.region('X', 150, 300),
        nc.region('X', 305, 400),
        nc.region('X', 500, 600),
        nc.region('Y', 40, 100),
        nc.region('Y', 120, 200),
    ])
    rsB = nc.regions('B', [
        nc.region('X', 30, 40),
        nc.region('X', 90, 120),
        nc.region('X', 140, 150),
        nc.region('X', 300, 310),
        nc.region('Y', 40, 100),
        nc.region('Y', 130, 300),
        nc.region('Y', 600, 700),
    ])

    #
    # Prepare data set
    #
    # The first three tracks are resized by 1000 before renaming; the
    # H3K27me3 track is only renamed.
    PcG = nc.biomarkers('PcG', [
        nc.loadGFFGZ('tutorial/Pc.gff3.gz').deltaResize(1000).rename('Pc'),
        nc.loadGFFGZ('tutorial/Psc.gff3.gz').deltaResize(1000).rename('Psc'),
        nc.loadGFFGZ('tutorial/dRING.gff3.gz').deltaResize(1000).rename('dRING'),
        nc.loadGFFGZ('tutorial/H3K27me3.gff3.gz').rename('H3K27me3'),
    ])

    gwWin = nc.getSequenceWindowRegions(
        genome, windowSize=1000, windowStep=100)

    PcGTargets = PcG.HBMEs(gwWin, threshold=4)

    PRESeq = PcGTargets.recenter(3000).extract(genome)
    # In-place shuffle of the extracted sequences.
    random.shuffle(PRESeq.sequences)

    MC = nc.MarkovChain(
        trainingSequences=genome,
        degree=4,
        pseudoCounts=1,
        addReverseComplements=True)

    # Two random DNA strings: 'X' with 800 bases and 'Y' with 1000 bases.
    testSeqs = nc.sequences('Test', [
        nc.sequence('X', ''.join(
            random.choice(['A', 'C', 'G', 'T']) for _ in range(800))),
        nc.sequence('Y', ''.join(
            random.choice(['A', 'C', 'G', 'T']) for _ in range(1000))),
    ])
import random

import gnocis as nc

# Genome object built from the gzipped FASTA file, restricted to the six
# named sequences.
genome = nc.streamFASTAGZ(
    'tutorial/DmelR5.fasta.gz',
    restrictToSequences=['2L', '2R', '3L', '3R', '4', 'X'],
)

# Sequence set 'Test' holding two random DNA strings: 'X' (800 bases) and
# 'Y' (1000 bases), each base drawn with random.choice from A/C/G/T.
testSeqs = nc.sequences('Test', [
    nc.sequence(
        label,
        ''.join(random.choice(['A', 'C', 'G', 'T']) for _ in range(length)),
    )
    for label, length in (('X', 800), ('Y', 1000))
])