Ejemplo n.º 1
0
 def test_get_ids(self):
     '''Check _get_ids returns the expected set of IDs for the test input file'''
     fasta_path = os.path.join(data_dir, 'cdhit_test_get_ids.fa')
     runner = cdhit.Runner(fasta_path, 'out')
     self.assertEqual({'id1', 'id2', 'id3'}, runner._get_ids(fasta_path))
Ejemplo n.º 2
0
 def test_get_run_cmd_with_unlimited_memory(self):
     '''Check the command built by get_run_cmd when memory_limit=0.

     The command must consist of a non-empty prefix followed by the expected
     option string. The option string is matched literally (via re.escape),
     so the '.' characters in the path and in the numbers cannot match
     arbitrary characters.
     '''
     fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
     r = cdhit.Runner(fa_infile, memory_limit=0)
     run_cmd = r.get_run_cmd('foo/bar/file.out')
     expected_tail = ' -o foo/bar/file.out -c 0.9 -T 1 -s 0.0 -d 0 -bak 1 -M 0'
     # '^.+' keeps the original requirement that something precedes the options.
     match = re.search('^.+' + re.escape(expected_tail) + '$', run_cmd)
     self.assertIsNotNone(match, msg="Command output was " + run_cmd)
Ejemplo n.º 3
0
 def test_fake_run_fail(self):
     '''Check fake_run raises cdhit.Error for input with non-unique names.

     The output file is removed in a finally clause so that it is not left
     behind when the assertion fails.
     '''
     infile = os.path.join(data_dir, 'cdhit_test_fake_run.non-unique.in.fa')
     tmpfile = 'tmp.cdhit_test_fake_run.out.non-unique.fa'
     r = cdhit.Runner(infile, tmpfile)
     try:
         with self.assertRaises(cdhit.Error):
             # Return value is irrelevant: the call is expected to raise.
             r.fake_run()
     finally:
         os.unlink(tmpfile)
Ejemplo n.º 4
0
 def test_run_min_cluster_number_42(self):
     '''Check run() returns clusters keyed '42' and '43' when min_cluster_number=42'''
     runner = cdhit.Runner(
         os.path.join(data_dir, 'cdhit_test_run.in.fa'),
         min_cluster_number=42,
     )
     got = runner.run()
     want = {'42': {'seq1', 'seq2', 'seq3'}, '43': {'seq4'}}
     self.assertEqual(got, want)
Ejemplo n.º 5
0
 def test_run(self):
     '''Check run() returns the expected clusters for the test input'''
     runner = cdhit.Runner(os.path.join(data_dir, 'cdhit_test_run.in.fa'))
     got = runner.run()
     want = {'0': {'seq1', 'seq2', 'seq3'}, '1': {'seq4'}}
     self.assertEqual(got, want)
Ejemplo n.º 6
0
 def test_rename_fasta(self):
     '''Check _rename_fasta writes a file identical to the expected output.

     The temporary output file is removed in a finally clause so that it
     is not left behind when the call or the comparison fails.
     '''
     infile = os.path.join(data_dir, 'cdhit_test_rename_fasta.in.fa')
     tmpfile = 'tmp.rename_fasta.out.fa'
     expected = os.path.join(data_dir, 'cdhit_test_rename_fasta.out.fa')
     names_dict = {'a': 'seq1', 'b': 'seq2', 'c': 'seq3'}
     r = cdhit.Runner(infile, 'out')
     try:
         r._rename_fasta(infile, tmpfile, names_dict)
         # shallow=False forces a byte-by-byte content comparison.
         self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
     finally:
         # The file may not exist if _rename_fasta failed before writing it.
         if os.path.exists(tmpfile):
             os.unlink(tmpfile)
Ejemplo n.º 7
0
 def test_run_get_clusters_from_file(self):
     '''Check run_get_clusters_from_file returns the expected clusters'''
     prefix = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in')
     runner = cdhit.Runner(prefix + '.fa')
     seq_names = {'seq1', 'seq2', 'seq3'}
     got = runner.run_get_clusters_from_file(prefix + '.clusters', seq_names)
     want = {'0': {'seq1', 'seq2'}, '1': {'seq3'}}
     self.assertEqual(got, want)
Ejemplo n.º 8
0
 def test_run_get_clusters_from_file_with_renaming(self):
     '''Check run_get_clusters_from_file returns renamed members when given rename_dict'''
     runner = cdhit.Runner(
         os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
     )
     clusters_path = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict.in.clusters')
     got = runner.run_get_clusters_from_file(
         clusters_path,
         {'seq1', 'seq2_renamed', 'seq3'},
         rename_dict={'seq2': 'seq2_renamed'},
     )
     want = {'0': {'seq1', 'seq2_renamed'}, '1': {'seq3'}}
     self.assertEqual(got, want)
Ejemplo n.º 9
0
 def test_enumerate_fasta(self):
     '''Check _enumerate_fasta writes the expected file and returns the expected dict.

     The temporary output file is removed in a finally clause so that it
     is not left behind when the call or either assertion fails.
     '''
     infile = os.path.join(data_dir, 'cdhit_test_enumerate_fasta.in.fa')
     expected_outfile = os.path.join(data_dir,
                                     'cdhit_test_enumerate_fasta.out.fa')
     tmpfile = 'tmp.test_enumerate_fasta.out.fa'
     expected_dict = {'1': 'a', '2': 'b', '3': 'c'}
     r = cdhit.Runner(infile, 'out')
     try:
         got_dict = r._enumerate_fasta(infile, tmpfile)
         # shallow=False forces a byte-by-byte content comparison.
         self.assertTrue(filecmp.cmp(expected_outfile, tmpfile, shallow=False))
         self.assertEqual(expected_dict, got_dict)
     finally:
         # The file may not exist if _enumerate_fasta failed before writing it.
         if os.path.exists(tmpfile):
             os.unlink(tmpfile)
Ejemplo n.º 10
0
 def test_run(self):
     '''Check run() returns the expected clusters and writes the expected output file.

     The temporary output file is removed in a finally clause so that it
     is not left behind when the run or either assertion fails.
     '''
     infile = os.path.join(data_dir, 'cdhit_test_run.in.fa')
     expected_outfile = os.path.join(data_dir, 'cdhit_test_run.out.fa')
     tmpfile = 'tmp.cdhit_test_run.out.fa'
     r = cdhit.Runner(infile, tmpfile)
     try:
         clusters = r.run()
         expected_clusters = {
             '0': {'seq1', 'seq2', 'seq3'},
             '1': {'seq4'},
         }
         self.assertEqual(clusters, expected_clusters)
         # shallow=False forces a byte-by-byte content comparison.
         self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
     finally:
         # The file may not exist if run() failed before writing it.
         if os.path.exists(tmpfile):
             os.unlink(tmpfile)
Ejemplo n.º 11
0
    def cluster_with_cdhit(self,
                           outprefix,
                           seq_identity_threshold=0.9,
                           threads=1,
                           length_diff_cutoff=0.0,
                           memory_limit=None,
                           nocluster=False,
                           verbose=False,
                           clusters_file=None):
        '''Cluster each reference type with cdhit.Runner and merge the results.

        Sequences are first written out under outprefix. Then, for each of
        the reference types 'noncoding', 'noncoding.varonly', 'gene' and
        'gene.varonly', the file outprefix.<type>.fa is clustered unless it
        is empty. Cluster numbering for each file starts one above the
        highest cluster key collected so far (0 for the first file). The
        merged clusters are written to outprefix.clusters.tsv.

        outprefix: prefix of the sequence files and of the allocation file.
        seq_identity_threshold, threads, length_diff_cutoff, memory_limit,
            verbose: passed unchanged to cdhit.Runner.
        nocluster: if true (and clusters_file is None), call fake_run()
            instead of run().
        clusters_file: if not None, clusters are obtained with
            run_get_clusters_from_file() using this file; this takes
            precedence over nocluster.

        Returns the merged dict of clusters from all reference types.
        '''
        ReferenceData._write_sequences_to_files(self.sequences, self.metadata,
                                                outprefix)
        merged = {}

        for ref_type in ('noncoding', 'noncoding.varonly', 'gene', 'gene.varonly'):
            ref_file = outprefix + '.' + ref_type + '.fa'
            if os.path.getsize(ref_file) == 0:
                # Nothing of this type was written, so nothing to cluster.
                continue

            # Keys are numeric strings; continue numbering after the largest.
            if merged:
                first_cluster = 1 + max(int(key) for key in merged.keys())
            else:
                first_cluster = 0

            runner = cdhit.Runner(
                ref_file,
                seq_identity_threshold=seq_identity_threshold,
                threads=threads,
                length_diff_cutoff=length_diff_cutoff,
                memory_limit=memory_limit,
                verbose=verbose,
                min_cluster_number=first_cluster,
            )

            if clusters_file is not None:
                found = runner.run_get_clusters_from_file(
                    clusters_file,
                    self.sequences,
                    rename_dict=self.rename_dict)
            elif nocluster:
                found = runner.fake_run()
            else:
                found = runner.run()

            merged.update(found)

        self.write_cluster_allocation_file(merged,
                                           outprefix + '.clusters.tsv')
        return merged
Ejemplo n.º 12
0
 def test_parse_cluster_info_file(self):
     '''Check _parse_cluster_info_file returns the expected clusters and representatives'''
     runner = cdhit.Runner(
         os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.in.fa'),
         'out',
     )
     # Maps '1'..'4' to 'seq1'..'seq4'.
     names_dict = {}
     for number in range(1, 5):
         names_dict[str(number)] = 'seq' + str(number)
     cluster_file = os.path.join(
         data_dir, 'cdhit_test_parse_cluster_info_file.out.fa.bak.clstr')
     got_clusters, got_reps = runner._parse_cluster_info_file(
         cluster_file, names_dict, {'1', '4'})
     self.assertEqual({'0': {'seq1', 'seq2', 'seq3'}, '1': {'seq4'}},
                      got_clusters)
     self.assertEqual({'1': '0', '4': '1'}, got_reps)
Ejemplo n.º 13
0
 def _run_cdhit(self):
     '''Set self.cluster_ids from a cdhit.Runner built from this object's settings.

     Calls run() when self.run_cd_hit is true; otherwise calls fake_run(),
     printing a notice first if self.verbose is set.
     '''
     runner = cdhit.Runner(
         self.db_fasta,
         self.db_fasta_clustered,
         seq_identity_threshold=self.cdhit_seq_identity_threshold,
         threads=self.threads,
         length_diff_cutoff=self.cdhit_length_diff_cutoff,
         verbose=self.verbose,
     )
     if not self.run_cd_hit:
         if self.verbose:
             print('Skipping cd-hit because --no_cdhit option used')
         self.cluster_ids = runner.fake_run()
         return
     self.cluster_ids = runner.run()
Ejemplo n.º 14
0
 def test_fake_run_fail(self):
     '''Check fake_run raises cdhit.Error for input with non-unique names'''
     runner = cdhit.Runner(
         os.path.join(data_dir, 'cdhit_test_fake_run.non-unique.in.fa')
     )
     self.assertRaises(cdhit.Error, runner.fake_run)
Ejemplo n.º 15
0
 def test_init_fail_invalid_memory(self):
     '''Check constructing a Runner with memory_limit=-10 raises cdhit.Error'''
     fasta_path = os.path.join(data_dir, 'cdhit_test_run.in.fa')
     self.assertRaises(cdhit.Error, cdhit.Runner, fasta_path, memory_limit=-10)
Ejemplo n.º 16
0
 def test_init_fail_infile_missing(self):
     '''Check constructing a Runner with a nonexistent input file raises cdhit.Error'''
     self.assertRaises(cdhit.Error, cdhit.Runner, 'oopsnotafile', 'out')