Example #1
0
 def setUpClass(cls):
     ''' resolve fixture paths, configure the engine, create a scratch file

     The fixture files are looked up in the ``fastqs`` directory that sits
     next to this module.
     '''
     fixture_dir = os.path.join(os.path.dirname(__file__), 'fastqs')
     fixtures = (
         ('fname', 'test_engine.fastq'),
         ('fname_1', 'test_engine_1.fastq'),
         ('fname_2', 'test_engine_2.fastq'),
     )
     for attr, basename in fixtures:
         setattr(cls, attr, os.path.join(fixture_dir, basename))

     engine.config(nthreads=1)

     # delete=False : the temporary file is not removed when it gets closed
     cls.tfn = tempfile.NamedTemporaryFile(suffix='.fastq', delete=False)
Example #2
0
 def test_forward_fastq(self):
     ''' every record written to a generated fastq file must yield one hit

     The same record (80 x ``A`` with quality ``#``) is written ``n`` times
     to the scratch file, once for every combination of ``+`` separator
     line (bare / with identifier) and line ending (LF / CRLF).  The file
     is then opened with ``Fastq`` and scanned for the 80bp sequence; the
     number of hits must equal the number of records written.
     '''
     engine.config(Amin='#', nthreads=2, minoverlap=80)
     for n in [3, 5, 7, 133]:
         for plus in ['+', '+IDENTIFIER']:
             for cr in ['\n', '\r\n']:
                 record = '@IDENTIFIER' + cr + 'A' * 80 + cr + \
                         plus + cr + '#' * 80 + cr
                 # close (and thereby flush) the file explicitly before it
                 # is read back, instead of relying on garbage collection
                 with open(self.tfn.name, 'wb') as out:
                     out.write(record * n)
                 # return value is discarded; the call only has to succeed
                 Fastq(self.tfn.name, quiet=True)
                 ret = engine.findseqs(self.tfn.name, ['A'*80])
                 assert len(ret['hits']) == n
Example #3
0
    def test_Amin(self):
        ''' test different values for ``Amin`` config parameter

        Scans the fixture file for two short sequences, first with
        ``Amin='H'`` and then with ``Amin='G'``, and checks the number of
        hits (plus the read length statistics of the first scan).
        '''
        seqs = ("GGAG", "CCGAC")

        # first scan : Amin='H'
        engine.config(Amin='H', minreadlength=4, maxerrors=0)
        result = engine.findseqs(self.fname, seqs)
        assert len(result['hits']) == 1
        readlengths = result['stats']['readlengths']
        assert readlengths[5] == 3
        assert readlengths[4] == 5

        # second scan : only Amin is changed, other settings stay in place
        engine.config(Amin='G')
        result = engine.findseqs(self.fname, seqs)
        assert len(result['hits']) == 2
Example #4
0
    def test_hits(self):
        ''' generated reads covering a random sequence must all be found

        A sequence obtained from ``FastqGenerator.randseq(51)`` is covered
        ``n`` times by generated reads.  Scanning the resulting file for
        that sequence must report ``n`` reads of length ``readlength`` and
        ``n`` hits.  Scanning for a variant of the sequence in which every
        base at an index that is a multiple of ``minoverlap`` is substituted
        must report the same read statistics but no hit at all
        (``maxerrors=0``).
        '''
        debug = False  # set to True to print diagnostics while developing

        fq = FastqGenerator(self.tfn.name, force=True)
        seq = fq.randseq(51)

        minoverlap = 25
        readlength = 100
        pmax = .05
        n = 100
        for _ in range(n):
            fq.cover_seq(seq,
                    minoverlap=minoverlap,
                    readlength=readlength,
                    pmax=pmax)
        fq.flush()
        #print "\033[94mfilesize=%.2f MB\033[m" % (fq.size() / 1024. / 1024.)

        # re-open the generated file for reading
        fq = Fastq(self.tfn.name, quiet=True)

        engine.config(
                nthreads=3,
                Amin=fq.Q2A(fq.p2Q(pmax)),
                maxerrors=0,
                # any threshold up to the length of the generated reads
                minreadlength=random.randint(minoverlap, readlength),
                minoverlap=minoverlap
            )
        ret = engine.findseqs(fq.fname, [seq])

        assert ret['stats']['readlengths'][readlength] == n
        assert len(ret['hits']) == n

        if debug:
            print('hits=%d'%len(ret['hits']))
            # use dedicated names : list comprehension variables leak into
            # the enclosing scope in python 2 and must not clobber ``n``
            print('readlenghts='+', '.join(['%dx %dbp'%(count, length)
                    for length,count in enumerate(ret['stats']['readlengths'])
                    if count]))

        # substitute every base whose index is a multiple of ``minoverlap``
        swap = {'A':'C','C':'G','G':'T','T':'A'}
        seqx = ''.join([base if pos % minoverlap != 0 else swap[base]
                    for pos,base in enumerate(seq)])
        ret = engine.findseqs(fq.fname, [seqx])

        if debug:
            print('0123456789'*6)
            print(('*'+' '*(minoverlap-1))*6)
            print(seq)
            print(seqx)
            print(str(ret['hits']))

        assert ret['stats']['readlengths'][readlength] == n
        assert len(ret['hits']) == 0
Example #5
0
    def test_maxerror(self):
        ''' test different values for ``maxerror`` config parameter

        The three query sequences differ from the reference below in their
        first one, two and three bases respectively; with ``maxerrors=k``
        exactly ``k`` hits are expected.
        '''
        # reference:  GAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT
        one_error = "CAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT"
        two_errors = "CTGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT"
        three_errors = "CTCCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT"
        seqs = (one_error, two_errors, three_errors)

        engine.config(minreadlength=25, minoverlap=25, Amin='!')

        for allowed in range(4):
            engine.config(maxerrors=allowed)
            result = engine.findseqs(self.fname, seqs)
            assert len(result['hits']) == allowed
Example #6
0
    def test_minoverlap(self):
        ''' test different values for ``minoverlap`` config parameter

        Runs three scans over the same four query sequences, varying
        ``minoverlap`` and ``maxerrors``, and checks the number of hits
        reported by each scan.
        '''
        seqs = (
            # end of sequence + junk
            "TCGATGCGATCTGTCAAGTCGGTGGCGGTA...",
            # end of sequence + junk + 1 error
            "TCGATGCGATCTG.CAAGTCGGTGGCGGTA...",
            # junk + beginning of sequence
            "...NTGAACGTATCGCCTCGAGGGACTT",
            # junk + beginning of sequence + 1 error
            "...NTGAACGTATCG.CTCGAGGGACTT",
        )

        # scan 1 : minoverlap=30, no errors allowed
        engine.config(maxerrors=0, minreadlength=25, minoverlap=30, Amin='!')
        found = engine.findseqs(self.fname, seqs)['hits']
        assert len(found) == 1
        only = found[0]
        assert only.seq_nr == 0
        assert only.seq_pos < 0

        # scan 2 : minoverlap=25, no errors allowed
        engine.config(maxerrors=0, minoverlap=25)
        found = engine.findseqs(self.fname, seqs)['hits']
        assert len(found) == 2
        for entry in found:
            # NOTE(review): indices 0 and 2 presumably are ``seq_nr`` and
            # ``seq_pos`` -- confirm against the definition of the hit type
            if entry[0] == 3:
                assert entry[2] > 0

        # scan 3 : minoverlap=25, one error allowed
        engine.config(maxerrors=1, minoverlap=25)
        found = engine.findseqs(self.fname, seqs)['hits']
        assert len(found) == 4
Example #7
0
    def test_paired(self, gz=False):
        ''' scanning one file or a pair of files must give equal results

        The result of scanning ``self.fname`` is compared with the result
        of scanning the tuple ``(self.fname_1, self.fname_2)``.

        :param gz: append ``.gz`` to all file names before scanning
        '''
        seqs = (
            "CCC", # "CCCC" should be counted 2x ...
            "TTTT",
            "TATATATA",
            "TGTAG", # at beginning
            "ATATT", # at end
            "GAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
            "...NACTTCCTCTCTACTGGTGTCGGCGGTGAAAGAGCTTACGTACTCTTCGAT...",
        )
        engine.config(maxerrors=0, minoverlap=1000, minreadlength=3, Amin='!')

        suffix = '.gz' if gz else ''
        single = self.fname + suffix
        pair = (self.fname_1 + suffix, self.fname_2 + suffix)

        result_single = engine.findseqs(single, seqs)
        result_pair = engine.findseqs(pair, seqs)

        assert result_single == result_pair
Example #8
0
    def test_findseqs(self, gz=False):
        ''' find specified sequences in handwritten .fastq file

        Every reported hit is verified by reading the bases back from the
        file at the reported position and comparing them with the query
        sequence; finally the number of hits per query sequence is compared
        with the expected counts.

        :param gz: scan (and read back from) the ``.gz`` variant of the file
        '''
        engine.config(maxerrors=0, minoverlap=1000, minreadlength=3, Amin='!')
        seqs = (
            "CCC", # "CCCC" should be counted 2x ...
            "TTTT",
            "TATATATA",
            "TGTAG", # at beginning
            "ATATT", # at end
            "GAGCATGTGGAGCAACTTGTGGGAGCGCCGGGCAACGCCCTGTCTCTTAT",
            "...NACTTCCTCTCTACTGGTGTCGGCGGTGAAAGAGCTTACGTACTCTTCGAT...",
        )
        fname = self.fname
        if gz:
            fname += '.gz'
        hits = engine.findseqs(fname, seqs)['hits']

        if gz:
            f = gzip.GzipFile(fname, 'rb')
        else:
            f = open(fname, 'rb')

        # number of hits per query sequence, indexed like ``seqs``
        counts = [0] * len(seqs)

        # try/finally : the file is closed even if an assertion fails
        try:
            for hit in hits:
                counts[hit.seq_nr] += 1

                seq = seqs[hit.seq_nr]
                if hit.seq_pos<0:
                    # negative ``seq_pos`` : read starts after ``file_pos``
                    # and is compared with the whole query sequence
                    offset = hit.file_pos-hit.seq_pos
                else:
                    offset = hit.file_pos
                    # NOTE(review): ``hit[3]`` presumably is ``hit.length``
                    # -- confirm against the definition of the hit type
                    seq = seq[hit.seq_pos:hit.seq_pos+hit[3]]

                f.seek(offset)
                bps = f.read(hit.length)

                assert bps == seq
        finally:
            f.close()

        assert counts == [19,1,0,1,1,1,1]
Example #9
0
 def setUp(self):
     ''' apply the baseline engine configuration used by the tests '''
     baseline = dict(
         nthreads=1,
         maxerrors=2,
         minoverlap=25,
         Amin='!',
         Azero='!',
     )
     engine.config(**baseline)
Example #10
0
File: cli.py Project: kvarq/kvarq
        sys.stderr.write('\n*** you must specify at least one testsuite! ***\n\n')
        sys.stderr.write('(use the -t command line switch)\n\n')
        sys.exit(ERROR_COMMAND_LINE_SWITCH)

    # prepare scanning {{{2

    try:
        fastq = Fastq(args.fastq, paired=not args.no_paired, variant=args.variant)
    except FastqFileFormatException, e:
        lo.error('cannot open file %s : %s'%(args.fastq, str(e)))
        sys.exit(ERROR_FASTQ_FORMAT_ERROR)

    engine.config(
            nthreads=args.threads,
            maxerrors=args.errors,
            Amin=fastq.Q2A(args.quality),
            Azero=fastq.Azero,
            minreadlength=args.readlength,
            minoverlap=args.overlap
        )

    analyser = analyse.Analyser()

    if not args.force:
        if os.path.exists(args.json):
            lo.error('will not overwrite file ' + args.json)
            sys.exit(ERROR_FILE_EXISTS)
        if args.extract_hits and os.path.exists(args.extract_hits):
            lo.error('will not overwrite file ' + args.extract_hits)
            sys.exit(ERROR_FILE_EXISTS)

    # do scanning {{{2