예제 #1
0
    def _map_and_cluster_reads(self):
        '''Map the reads to the clustered genes and set up the clusters.

        Runs self._minimap_reads_to_all_ref_seqs with the output prefix
        'minimap', then loads the results with self._load_minimap_files
        and stores them in self.cluster_to_rep, self.cluster_read_counts,
        self.cluster_base_counts, self.insert_hist and self.proper_pairs.

        Also sets self.cluster_to_dir, which maps each key of
        self.cluster_to_rep to a path under self.tmp_dir. If
        self.cluster_read_counts is non-empty, self.read_store is built
        from the 'minimap.reads' file.

        The 'minimap.reads' file is always deleted afterwards. The other
        minimap output files are deleted only when self.clean is true.
        '''
        if self.verbose:
            print('{:_^79}'.format(' Mapping reads to clustered genes '),
                  flush=True)

        minimap_prefix = 'minimap'

        self._minimap_reads_to_all_ref_seqs(self.clusters_tsv,
                                            self.all_ref_seqs_fasta,
                                            self.reads_1,
                                            self.reads_2,
                                            minimap_prefix,
                                            verbose=self.verbose)

        if self.verbose:
            print('Finished mapping\n')
            print('{:_^79}'.format(' Generating clusters '), flush=True)

        (self.cluster_to_rep,
         self.cluster_read_counts,
         self.cluster_base_counts,
         self.insert_hist,
         self.proper_pairs) = self._load_minimap_files(minimap_prefix,
                                                       self.insert_hist_bin)
        self.cluster_to_dir = {
            x: os.path.join(self.tmp_dir, x)
            for x in self.cluster_to_rep
        }
        reads_file_for_read_store = minimap_prefix + '.reads'

        if len(self.cluster_read_counts):
            # Pass stdout as the ReadStore log handle only in verbose mode.
            filehandle = sys.stdout if self.verbose else None

            self.read_store = read_store.ReadStore(
                reads_file_for_read_store,
                os.path.join(self.outdir, 'read_store'),
                log_fh=filehandle)

        # The reads file is removed whether or not a ReadStore was built
        # from it.
        os.unlink(reads_file_for_read_store)

        if self.clean:
            for suffix in ('cluster2representative', 'clusterCounts',
                           'insertHistogram', 'properPairs'):
                filename = minimap_prefix + '.' + suffix
                try:
                    os.unlink(filename)
                except OSError:
                    # Best-effort cleanup: a missing or undeletable file
                    # is ignored. Only OSError is caught, so
                    # KeyboardInterrupt and SystemExit still propagate.
                    pass

        if self.verbose:
            print('Found', self.proper_pairs, 'proper read pairs from minimap')
            print('Total clusters to perform local assemblies:',
                  len(self.cluster_to_dir),
                  flush=True)
예제 #2
0
 def test_get_reads_fq_interleave(self):
     '''Test get_reads fastq interleaved (single output file)'''
     infile = os.path.join(data_dir, 'read_store_test_get_reads.in')
     expected = os.path.join(data_dir,
                             'read_store_test_get_reads.expected.reads.fq')
     outprefix = 'tmp.read_store_test_get_reads'
     reads = outprefix + '.reads_1.fq'
     rstore = read_store.ReadStore(infile, outprefix)
     got_reads, got_bases = rstore.get_reads('cluster2', reads)
     self.assertEqual(6, got_reads)
     self.assertEqual(24, got_bases)
     # shallow=False forces a comparison of the file contents. The default
     # shallow mode can report equality from os.stat() signatures alone.
     self.assertTrue(filecmp.cmp(expected, reads, shallow=False))
     # Remove the files made by the ReadStore, then the extracted reads.
     os.unlink(outprefix + '.gz')
     os.unlink(outprefix + '.gz.tbi')
     os.unlink(reads)
예제 #3
0
 def test_clean(self):
     '''Test that clean removes the files made when building a ReadStore'''
     prefix = 'tmp.read_store_test_clean'
     gz_file = prefix + '.gz'
     tbi_file = prefix + '.gz.tbi'
     store_files = (prefix, gz_file, tbi_file)
     source = os.path.join(data_dir, 'read_store_test_clean.in')

     # None of the files may exist before the store is built.
     for path in store_files:
         self.assertFalse(os.path.exists(path))

     store = read_store.ReadStore(source, prefix)

     # Building the store leaves only the .gz file and its .tbi index.
     self.assertFalse(os.path.exists(prefix))
     for path in (gz_file, tbi_file):
         self.assertTrue(os.path.exists(path))

     store.clean()

     # After clean() none of the files may remain.
     for path in store_files:
         self.assertFalse(os.path.exists(path))
예제 #4
0
 def _test_run(self):
     '''Test run of ReadFilter against the expected pair of fastq files'''
     # NOTE(review): the leading underscore presumably keeps the test
     # runner from collecting this test - confirm whether that is intended.
     reference = os.path.join(data_dir, 'read_filter_test_run.in.ref.fa')
     # (expected file, file written by the filter) for reads 1 and reads 2.
     fq_pairs = [
         (os.path.join(data_dir, 'read_filter_test_run.expected.reads_1.fq'),
          'tmp.filter_test_run.reads_1.fq'),
         (os.path.join(data_dir, 'read_filter_test_run.expected.reads_2.fq'),
          'tmp.filter_test_run.reads_2.fq'),
     ]
     store = read_store.ReadStore(
         os.path.join(data_dir, 'read_filter_test_run.in.read_store'),
         'tmp.filter_test_run.read_store')
     filterer = read_filter.ReadFilter(store, reference, '1', sys.stdout)

     n_reads, n_bases = filterer.run(fq_pairs[0][1], fq_pairs[1][1])
     self.assertEqual(12, n_reads)
     self.assertEqual(912, n_bases)

     for wanted, produced in fq_pairs:
         self.assertTrue(filecmp.cmp(wanted, produced, shallow=False))

     # Tidy up: the filtered reads first, then the read store's own files.
     for _, produced in fq_pairs:
         os.unlink(produced)
     store.clean()
예제 #5
0
 def test_get_reads_fq_pair(self):
     '''Test get_reads fastq pair (two output files, via out2)'''
     infile = os.path.join(data_dir, 'read_store_test_get_reads.in')
     expected1 = os.path.join(
         data_dir, 'read_store_test_get_reads.expected.reads_1.fq')
     expected2 = os.path.join(
         data_dir, 'read_store_test_get_reads.expected.reads_2.fq')
     outprefix = 'tmp.read_store_test_get_reads'
     reads1 = outprefix + '.reads_1.fq'
     reads2 = outprefix + '.reads_2.fq'
     rstore = read_store.ReadStore(infile, outprefix)
     got_reads, got_bases = rstore.get_reads('cluster2',
                                             reads1,
                                             out2=reads2)
     self.assertEqual(6, got_reads)
     self.assertEqual(24, got_bases)
     # shallow=False forces a comparison of the file contents. The default
     # shallow mode can report equality from os.stat() signatures alone.
     self.assertTrue(filecmp.cmp(expected1, reads1, shallow=False))
     self.assertTrue(filecmp.cmp(expected2, reads2, shallow=False))
     # Remove the files made by the ReadStore, then the extracted reads.
     os.unlink(outprefix + '.gz')
     os.unlink(outprefix + '.gz.tbi')
     os.unlink(reads1)
     os.unlink(reads2)
예제 #6
0
 def test_get_reads_subset(self):
     '''Test get_reads subset (fastq pair with a wanted_ids set supplied)'''
     infile = os.path.join(data_dir, 'read_store_test_get_reads.in')
     expected1 = os.path.join(
         data_dir, 'read_store_test_get_reads.expected.reads_subset.1.fq')
     expected2 = os.path.join(
         data_dir, 'read_store_test_get_reads.expected.reads_subset.2.fq')
     wanted_ids = {1, 11}
     outprefix = 'tmp.read_store_test_get_reads'
     reads1 = outprefix + '.reads_1.fq'
     reads2 = outprefix + '.reads_2.fq'
     rstore = read_store.ReadStore(infile, outprefix)
     got_reads, got_bases = rstore.get_reads('cluster2',
                                             reads1,
                                             out2=reads2,
                                             wanted_ids=wanted_ids)
     self.assertEqual(4, got_reads)
     self.assertEqual(16, got_bases)
     # shallow=False forces a comparison of the file contents. The default
     # shallow mode can report equality from os.stat() signatures alone.
     self.assertTrue(filecmp.cmp(expected1, reads1, shallow=False))
     self.assertTrue(filecmp.cmp(expected2, reads2, shallow=False))
     # Remove the files made by the ReadStore, then the extracted reads.
     os.unlink(outprefix + '.gz')
     os.unlink(outprefix + '.gz.tbi')
     os.unlink(reads1)
     os.unlink(reads2)