Esempio n. 1
0
 def test_write_fasta(self):
     '''Test write_fasta: the written file must match the reference FASTA byte-for-byte'''
     s = seed.Seed(seq='GAAGGCGGCAGC')
     tmpfile = 'tmp.seed.fa'
     expected = os.path.join(data_dir, 'seed_test.write_fasta.fa')
     try:
         s.write_fasta(tmpfile, 'spam')
         # shallow=False forces a comparison of file contents, not just os.stat data
         self.assertTrue(filecmp.cmp(tmpfile, expected, shallow=False))
     finally:
         # Always remove the temporary file, even when the assertion fails,
         # so a failing run does not leave tmp.seed.fa in the working directory.
         # The existence check avoids masking an error raised by write_fasta
         # before the file was created.
         if os.path.exists(tmpfile):
             os.unlink(tmpfile)
Esempio n. 2
0
 def test_extend_with_reads_as_single_end(self):
     '''_extend_with_reads_as_single_end should grow seed AGGCT into TGAGGCTAT'''
     read_files = [
         os.path.join(data_dir, fname)
         for fname in ('kcount_test.reads_1.fasta', 'kcount_test.reads_2.fasta')
     ]
     test_seed = seed.Seed(seq='AGGCT', ext_min_cov=1, verbose=2)
     # Both read files are passed positionally: first reads_1, then reads_2.
     test_seed._extend_with_reads_as_single_end(*read_files)
     self.assertEqual('TGAGGCTAT', test_seed.seq)
Esempio n. 3
0
 def test_extensions_from_reads_file(self):
     '''Check the pair of extension lists _extensions_from_reads_file returns for each reads file'''
     test_seed = seed.Seed(seq='AGGCT')
     # (reads file, expected first list, expected second list)
     expectations = (
         ('kcount_test.reads_1.fasta', [], ['A', 'AT', 'AT']),
         ('kcount_test.reads_2.fasta', ['G', 'TG', 'TG'], []),
     )
     for reads_name, want_left, want_right in expectations:
         reads_path = os.path.join(data_dir, reads_name)
         got_left, got_right = test_seed._extensions_from_reads_file(reads_path)
         self.assertListEqual(got_left, want_left)
         self.assertListEqual(got_right, want_right)
Esempio n. 4
0
    def add_new_seed_contig(self, reads1, reads2, contig_name=None, max_attempts=10):
        '''Try to make a new seed contig from the reads and add it to self.contigs.

        If self.contigs is empty, seeds are made directly from reads1/reads2.
        Otherwise a subset of reads is written into a temporary directory
        (created in the current working directory) and used instead:
          * contig_name given: reads selected by _map_reads with
            mate_ref=contig_name, converted from BAM to a single FASTA file;
          * contig_name is None: the output of _get_unmapped_pairs.

        Up to max_attempts candidate seeds are made. Each one is extended
        using the full reads1/reads2 and recorded in self.used_seeds; the
        first whose length reaches 75% of self.seed_stop_length is accepted.

        Returns the name of the newly added contig ('seeded.NNNNN'), or None
        if no acceptable seed could be made.
        '''
        tmpdir = None
        made_seed = False
        s = None

        try:
            if len(self.contigs):
                tmpdir = tempfile.mkdtemp(prefix='tmp.make_seed.', dir=os.getcwd())
                tmp_prefix = os.path.join(tmpdir, 'out')
                seed_reads1 = tmp_prefix + '_1.fa'
                seed_reads2 = tmp_prefix + '_2.fa'
                if contig_name is not None:
                    # NOTE(review): 5 and 8 look like SAM flag bitmasks
                    # (paired + read unmapped / mate unmapped) - confirm
                    # against _map_reads.
                    self._map_reads(reads1, reads2, tmp_prefix, required_flag=5, exclude_flag=8, mate_ref=contig_name)
                    mapping.bam_to_fasta(tmp_prefix + '.bam', seed_reads1)
                    # Only one reads file is produced in this branch.
                    seed_reads2 = None
                else:
                    self._get_unmapped_pairs(reads1, reads2, tmp_prefix)
            else:
                seed_reads1 = reads1
                seed_reads2 = reads2

            for i in range(max_attempts):
                s = seed.Seed(reads1=seed_reads1, reads2=seed_reads2, extend_length=self.seed_ext_max_bases, seed_length=self.seed_start_length, seed_min_count=self.seed_min_kmer_count, seed_max_count=self.seed_max_kmer_count, ext_min_cov=self.seed_min_cov, ext_min_ratio=self.seed_min_ratio, verbose=self.verbose, kmc_threads=self.kmc_threads, map_threads=self.threads, sequences_to_ignore=self.used_seeds, contigs_to_check=self.contigs)

                # No candidate seed could be made at all: further attempts
                # are not tried.
                if s.seq is None or len(s.seq) == 0:
                    break

                if self.seed_overlap_length is None:
                    s.overlap_length = len(s.seq)
                else:
                    s.overlap_length = self.seed_overlap_length
                # Extension uses the full input reads, not the subset used to
                # pick the seed.
                s.extend(reads1, reads2, self.seed_stop_length)
                # Remember this sequence so it is passed as
                # sequences_to_ignore on later attempts and calls.
                self.used_seeds.add(s.seq)

                if len(s.seq) >= 0.75 * self.seed_stop_length:
                    made_seed = True
                    break
                elif self.verbose:
                    print("    Couldn't extend seed enough. That was attempt", i+1, 'of', max_attempts, flush=True)
        finally:
            # Remove the temporary directory even if mapping, seed
            # construction or extension raised, so failed runs do not leave
            # tmp.make_seed.* directories behind.
            if tmpdir is not None:
                shutil.rmtree(tmpdir)

        if not made_seed or len(s.seq) == 0:
            return None

        if self.verbose:
            print("    Extended seed OK.", flush=True)

        # Pick the first unused name of the form seeded.00001, seeded.00002, ...
        name_index = 1
        new_name = 'seeded.' + str(name_index).zfill(5)
        while new_name in self.contigs:
            name_index += 1
            new_name = 'seeded.' + str(name_index).zfill(5)

        self._add_contig(pyfastaq.sequences.Fasta(new_name, s.seq))
        return new_name
Esempio n. 5
0
    def test_extension_from_read(self):
        '''Check _extension_from_read with its default arguments and with left=True'''
        test_seed = seed.Seed(seq='AGGCT')

        def make_read(bases):
            return pyfastaq.sequences.Fasta('x', bases)

        # (expected extension, read sequence) for calls with default arguments
        default_cases = [
            (None, 'AAAAA'),
            (None, 'AGGC'),
            ('A', 'AGGCTA'),
            ('AT', 'AGGCTAT'),
            ('AT', 'GGGAGGCTAT'),
            ('AA', 'TTAGCCT'),
        ]
        for want, bases in default_cases:
            self.assertEqual(want, test_seed._extension_from_read(make_read(bases)))

        # (expected extension, read sequence) for calls with left=True
        left_cases = [
            (None, 'AAAAA'),
            (None, 'AGGCTA'),
            ('GT', 'GTAGGCTA'),
            ('GT', 'GTAGGCTATTC'),
            ('GT', 'AGCCTAC'),
        ]
        for want, bases in left_cases:
            self.assertEqual(want, test_seed._extension_from_read(make_read(bases), left=True))
Esempio n. 6
0
    def _make_new_seed(self, seed_name):
        '''Build a replacement seed for seed_name and write it to a FASTA file.

        Reads are extracted from self.bam_file for the reference seed_name.
        If the original seed is longer than self.seed_stop_length, only a
        central window of that length is used; otherwise start and end are
        passed as None. A seed.Seed is made from those reads, extended using
        self.reads1/self.reads2, and written with 10 bases trimmed from each
        end to '<self.tmpdir>/out.<seed_name>.fa' under the name
        'seed.<seed_name>'.

        Returns None. If the new seed has length 0, a warning is printed and
        no file is written.
        '''
        if self.verbose:
            print('Making new seed for', seed_name, ' ... start')
        tmp_prefix = os.path.join(self.tmpdir, 'out')
        seed_reads = tmp_prefix + '.' + seed_name + '.reads_1.fa'
        original_length = len(self.original_seeds[seed_name])
        if original_length > self.seed_stop_length:
            # Window of seed_stop_length centred on the middle of the
            # original seed.
            start = int(0.5 * original_length -
                        0.5 * self.seed_stop_length)
            end = int(0.5 * original_length +
                      0.5 * self.seed_stop_length)
        else:
            start = None
            end = None
        if self.verbose:
            print('Making new seed for', seed_name, ' ... getting reads')
        mapping.bam_file_to_region_fasta(self.bam_file, seed_reads, seed_name,
                                         start, end)
        if self.verbose:
            print('Making new seed for', seed_name,
                  ' ... finding most common kmer')
        new_seed = seed.Seed(extend_length=self.extend_length,
                             overlap_length=self.overlap_length,
                             reads1=seed_reads,
                             ext_min_cov=self.ext_min_cov,
                             ext_min_ratio=self.ext_min_ratio,
                             verbose=self.verbose,
                             seed_length=self.seed_length,
                             seed_min_count=self.seed_min_count,
                             seed_max_count=self.seed_max_count,
                             kmc_threads=self.kmc_threads,
                             map_threads=self.threads)
        if len(new_seed) == 0:
            print('Warning: could not get most common kmer for', seed_name)
            return

        if self.verbose:
            print('Making new seed for', seed_name,
                  ' ... extending most common kmer')

        new_seed.extend(self.reads1, self.reads2, self.seed_stop_length)
        f = pyfastaq.utils.open_file_write(tmp_prefix + '.' + seed_name +
                                           '.fa')
        try:
            # The first and last 10 bases of the extended seed are dropped
            # before writing.
            print(pyfastaq.sequences.Fasta('seed.' + seed_name,
                                           new_seed.seq[10:-10]),
                  file=f)
        finally:
            # Close the handle even if writing fails, so it is not leaked.
            pyfastaq.utils.close(f)
        if self.verbose:
            print('Making new seed for', seed_name, ' ... finished')
Esempio n. 7
0
 def test_len(self):
     '''len() of a Seed is 5 for seq AGGCT and 0 once seq is set to None'''
     test_seed = seed.Seed(seq='AGGCT')
     length_with_sequence = len(test_seed)
     self.assertEqual(5, length_with_sequence)

     test_seed.seq = None
     length_without_sequence = len(test_seed)
     self.assertEqual(0, length_without_sequence)