def test_minoverlap(self):
    ''' Exercise the ``minoverlap`` config parameter with partial matches.

    Every probe only partially overlaps the searched data (see the inline
    comments); two of them additionally carry one error.  The expected hit
    count is checked for three ``minoverlap`` / ``maxerrors`` settings.
    '''
    probes = (
        "TCGATGCGATCTGTCAAGTCGGTGGCGGTA...",  # end of sequence + junk
        "TCGATGCGATCTG.CAAGTCGGTGGCGGTA...",  # end of sequence + junk + 1 error
        "...NTGAACGTATCGCCTCGAGGGACTT",  # junk + beginning of sequence
        "...NTGAACGTATCG.CTCGAGGGACTT",  # junk + beginning of sequence + 1 error
    )

    # minoverlap=30, no errors allowed: only the first probe is found, and
    # it is reported with a negative seq_pos.
    engine.config(maxerrors=0, minreadlength=25, minoverlap=30, Amin='!')
    found = engine.findseqs(self.fname, probes)['hits']
    assert len(found) == 1
    assert found[0].seq_nr == 0
    assert found[0].seq_pos < 0

    # minoverlap=25, still no errors allowed: two hits.
    engine.config(maxerrors=0, minoverlap=25)
    found = engine.findseqs(self.fname, probes)['hits']
    assert len(found) == 2
    # presumably hit[0] is seq_nr and hit[2] is seq_pos -- TODO confirm
    # against the hit type's field order.
    assert all(entry[0] != 3 or entry[2] > 0 for entry in found)

    # One error allowed: all four probes are found.
    engine.config(maxerrors=1, minoverlap=25)
    found = engine.findseqs(self.fname, probes)['hits']
    assert len(found) == 4
def test_Amin(self):
    ''' Check hits and read-length statistics for two ``Amin`` settings. '''
    probes = ("GGAG", "CCGAC")

    # Amin='H': one hit, plus fixed counts in the read-length statistics.
    engine.config(Amin='H', minreadlength=4, maxerrors=0)
    result = engine.findseqs(self.fname, probes)
    assert len(result['hits']) == 1
    readlengths = result['stats']['readlengths']
    assert readlengths[5] == 3
    assert readlengths[4] == 5

    # Amin='G': both probes are found.
    engine.config(Amin='G')
    result = engine.findseqs(self.fname, probes)
    assert len(result['hits']) == 2
def test_hits(self):
    ''' Search a generated .fastq file for a random sequence.

    ``n`` reads are generated with ``FastqGenerator.cover_seq`` for one
    random 51 bp sequence (presumably each call emits one read overlapping
    the sequence by at least ``minoverlap`` bases -- TODO confirm against
    FastqGenerator).  The original sequence must be found ``n`` times.  A
    variant with every ``minoverlap``-th base substituted must never be
    found while ``maxerrors`` is 0.
    '''
    fq = FastqGenerator(self.tfn.name, force=True)
    seq = fq.randseq(51)
    minoverlap = 25
    readlength = 100
    pmax = .05
    n = 100
    for _ in range(n):
        fq.cover_seq(seq, minoverlap=minoverlap, readlength=readlength,
                     pmax=pmax)
    fq.flush()

    fq = Fastq(self.tfn.name, quiet=True)
    engine.config(
        nthreads=3,
        Amin=fq.Q2A(fq.p2Q(pmax)),
        maxerrors=0,
        minreadlength=random.randint(minoverlap, readlength),
        minoverlap=minoverlap
    )

    ret = engine.findseqs(fq.fname, [seq])
    assert ret['stats']['readlengths'][readlength] == n
    assert len(ret['hits']) == n

    # Substitute the bases at positions 0, minoverlap, 2*minoverlap, ...
    # Any window of ``minoverlap`` consecutive positions contains one of
    # them, so no overlap of the required length can match without error.
    substitute = {'A':'C','C':'G','G':'T','T':'A'}
    seqx = ''.join([
        base if pos % minoverlap != 0 else substitute[base]
        for pos, base in enumerate(seq)
    ])
    ret = engine.findseqs(fq.fname, [seqx])
    assert ret['stats']['readlengths'][readlength] == n
    assert len(ret['hits']) == 0
def test_forward_fastq(self):
    ''' Search .fastq files written with different record layouts.

    For every combination of record count, third-line form (``+`` alone
    or ``+IDENTIFIER``) and line ending (``\\n`` or ``\\r\\n``) a file of
    ``n`` identical 80-base records is written.  Searching for the 80-base
    sequence must then yield exactly ``n`` hits.
    '''
    engine.config(Amin='#', nthreads=2, minoverlap=80)
    for n in [3, 5, 7, 133]:
        for plus in ['+', '+IDENTIFIER']:
            for cr in ['\n', '\r\n']:
                record = '@IDENTIFIER' + cr + 'A' * 80 + cr + \
                         plus + cr + '#' * 80 + cr
                # Close the file explicitly so the data is flushed to disk
                # before it is read back below.
                with open(self.tfn.name, 'wb') as out:
                    out.write(record * n)
                # The instance is discarded; presumably constructing it
                # parses/validates the file -- TODO confirm against Fastq.
                Fastq(self.tfn.name, quiet=True)
                ret = engine.findseqs(self.tfn.name, ['A'*80])
                assert len(ret['hits']) == n
def test_paired(self, gz=False):
    ''' Compare a single-file search with a search over a pair of files.

    ``engine.findseqs`` is called once with ``self.fname`` and once with
    the tuple ``(self.fname_1, self.fname_2)``; both calls must return
    equal results.  With ``gz`` set, ``.gz`` is appended to every file
    name first.
    '''
    engine.config(maxerrors=0, minoverlap=1000, minreadlength=3, Amin='!')
    seqs = (
        "CCC",  # "CCCC" should be counted 2x ...
        "TTTT",
        "TATATATA",
        "TGTAG",  # at beginning
        "ATATT",  # at end
        "GAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
        "...NACTTCCTCTCTACTGGTGTCGGCGGTGAAAGAGCTTACGTACTCTTCGAT...",
    )

    single, first, second = self.fname, self.fname_1, self.fname_2
    if gz:
        single, first, second = [
            name + '.gz' for name in (single, first, second)
        ]

    single_result = engine.findseqs(single, seqs)
    paired_result = engine.findseqs((first, second), seqs)
    assert single_result == paired_result
def test_fastq(self):
    ''' Malformed .fastq records must raise FastqFileFormatException.

    Two broken files are written in turn: one whose first line does not
    start with ``@`` and one whose third line is ``-`` instead of ``+``.
    ``engine.findseqs`` has to reject each of them.
    '''
    malformed = (
        (
            '''_IDENTIFIER
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+
#############################################
''',
            "malformed @IDENTIFIER must raise FastqFileFormatException",
        ),
        (
            '''@IDENTIFIER
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
-
#############################################
''',
            "malformed 3rd line must raise FastqFileFormatException",
        ),
    )
    for content, message in malformed:
        # Close the file explicitly so the data is on disk before the
        # engine opens it.
        with open(self.tfn.name, 'w') as out:
            out.write(content)
        try:
            engine.findseqs(self.tfn.name, [])
        except FastqFileFormatException:
            pass
        else:
            # Raised explicitly (not ``assert False``) so the check also
            # holds when Python runs with -O.
            raise AssertionError(message)
def test_maxerror(self):
    ''' Exercise the ``maxerrors`` config parameter.

    The three probes differ from the reference sequence in their first
    one, two and three bases.  For every tolerance from 0 to 3 the number
    of hits must equal the tolerance.
    '''
    engine.config(minreadlength=25, minoverlap=25, Amin='!')
    # reference:
    #   "GAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT"
    probes = (
        # first base differs: 1 error
        "CAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
        # first two bases differ: 2 errors
        "CTGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
        # first three bases differ: 3 errors
        "CTCCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
    )
    for tolerance in range(len(probes) + 1):
        engine.config(maxerrors=tolerance)
        result = engine.findseqs(self.fname, probes)
        assert len(result['hits']) == tolerance
def test_findseqs(self, gz=False):
    ''' Find specified sequences in the handwritten .fastq file.

    Every reported hit is verified by reading ``hit.length`` bytes back
    from the file at the reported position and comparing them with the
    probe sequence.  Finally the number of hits per probe is checked.
    With ``gz`` set, the ``.gz`` variant of the file is searched and read
    back through ``gzip.GzipFile``.
    '''
    engine.config(maxerrors=0, minoverlap=1000, minreadlength=3, Amin='!')
    seqs = (
        "CCC",  # "CCCC" should be counted 2x ...
        "TTTT",
        "TATATATA",
        "TGTAG",  # at beginning
        "ATATT",  # at end
        "GAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
        "...NACTTCCTCTCTACTGGTGTCGGCGGTGAAAGAGCTTACGTACTCTTCGAT...",
    )
    fname = self.fname
    if gz:
        fname += '.gz'
    hits = engine.findseqs(fname, seqs)['hits']

    if gz:
        f = gzip.GzipFile(fname, 'rb')
    else:
        f = open(fname, 'rb')
    x = [0] * len(seqs)  # number of hits per probe, indexed by seq_nr
    # try/finally so the handle is released even when an assertion fails.
    try:
        for hit in hits:
            x[hit.seq_nr] += 1
            seq = seqs[hit.seq_nr]
            if hit.seq_pos < 0:
                f.seek(hit.file_pos - hit.seq_pos)
            else:
                f.seek(hit.file_pos)
                # Compare only the slice of the probe that starts at
                # seq_pos; presumably hit[3] is the same field as
                # hit.length -- TODO confirm against the hit type.
                seq = seq[hit.seq_pos:hit.seq_pos + hit[3]]
            bps = f.read(hit.length)
            assert bps == seq
    finally:
        f.close()
    assert x == [19,1,0,1,1,1,1]