Beispiel #1
0
    def _compute_seqs_per_file(self, input_fasta_fp, num_jobs_to_start):
        """ Compute the number of sequences to include in each split file
        """
        # count the number of sequences in the fasta file
        num_input_seqs = count_seqs(input_fasta_fp)[0]

        # divide the number of sequences by the number of jobs to start
        result = num_input_seqs / num_jobs_to_start

        # if we don't have a perfect split, round up
        if result % 1 != 0:
            result += 1

        # return the result as an integer
        return int(result)
Beispiel #2
0
    def _compute_seqs_per_file(self, input_fasta_fp, num_jobs_to_start):
        """ Compute the number of sequences to include in each split file
        """
        # count the number of sequences in the fasta file
        num_input_seqs = count_seqs(input_fasta_fp)[0]

        # divide the number of sequences by the number of jobs to start
        result = num_input_seqs / num_jobs_to_start

        # if we don't have a perfect split, round up
        if result % 1 != 0:
            result += 1

        # return the result as an integer
        return int(result)
Beispiel #3
0
    def test_run_pick_de_novo_otus_muscle(self):
        """run_pick_de_novo_otus w muscle generates expected results
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = {'alignment_method': 'muscle'}
        self.params['filter_alignment'] = \
            {'suppress_lane_mask_filter': None,
             'entropy_threshold': '0.10'}

        run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=False,
            status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out,
                            'muscle_aligned_seqs', '%s_rep_set_aligned.fasta' %
                            input_file_basename)
        taxonomy_assignments_fp = join(self.test_out,
                                       'uclust_assigned_taxonomy', '%s_rep_set_tax_assignments.txt' %
                                       input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        # Number of OTUs falls within a range that was manually
        # confirmed
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # all OTUs align
        self.assertEqual(count_seqs(alignment_fp)[0], num_otus)

        # all OTUs in tree
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), num_otus)

        # check that the two final output files have non-zero size
        self.assertTrue(getsize(tree_fp) > 0)
        self.assertTrue(getsize(otu_table_fp) > 0)

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1',
            'f2',
            'f3',
            'f4',
            'p1',
            'p2',
            't1',
            't2',
            'not16S.1']
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # expected OTUs
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum([v.sum()
                                       for v in otu_table.iter_data()])
        self.assertEqual(number_seqs_in_otu_table, count_seqs(self.test_data['seqs'][0])[0])
Beispiel #4
0
    def test_run_pick_de_novo_otus_parallel(self):
        """run_pick_de_novo_otus generates expected results in parallel
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = \
            {'template_fp': self.test_data['refseqs_aligned'][0]}
        self.params['filter_alignment'] = \
            {'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
        actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=True,
            status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out,
                            'pynast_aligned_seqs', '%s_rep_set_aligned.fasta' %
                            input_file_basename)
        failures_fp = join(self.test_out,
                           'pynast_aligned_seqs', '%s_rep_set_failures.fasta' %
                           input_file_basename)
        taxonomy_assignments_fp = join(self.test_out,
                                       'uclust_assigned_taxonomy', '%s_rep_set_tax_assignments.txt' %
                                       input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        self.assertEqual(actual_tree_fp, tree_fp)
        self.assertEqual(actual_otu_table_fp, otu_table_fp)

        # Number of OTUs falls within a range that was manually
        # confirmed
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # number of seqs which aligned + num of seqs which failed to
        # align sum to the number of OTUs
        self.assertEqual(count_seqs(alignment_fp)[0] + count_seqs(failures_fp)[0], num_otus)

        # number of tips in the tree equals the number of sequences that
        # aligned
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1',
            'f2',
            'f3',
            'f4',
            'p1',
            'p2',
            't1',
            't2',
            'not16S.1']
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # otu ids are as expected
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum([v.sum()
                                       for v in otu_table.iter_data()])
        self.assertEqual(number_seqs_in_otu_table, count_seqs(self.test_data['seqs'][0])[0])

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)
Beispiel #5
0
    def test_run_pick_de_novo_otus_muscle(self):
        """run_pick_de_novo_otus w muscle generates expected results
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = {'alignment_method': 'muscle'}
        self.params['filter_alignment'] = \
            {'suppress_lane_mask_filter': None,
             'entropy_threshold': '0.10'}

        run_pick_de_novo_otus(self.test_data['seqs'][0],
                              self.test_out,
                              call_commands_serially,
                              self.params,
                              self.qiime_config,
                              parallel=False,
                              status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'muscle_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        # Number of OTUs falls within a range that was manually
        # confirmed
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # all OTUs align
        self.assertEqual(count_seqs(alignment_fp)[0], num_otus)

        # all OTUs in tree
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), num_otus)

        # check that the two final output files have non-zero size
        self.assertTrue(getsize(tree_fp) > 0)
        self.assertTrue(getsize(otu_table_fp) > 0)

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1'
        ]
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # expected OTUs
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            [v.sum() for v in otu_table.iter_data()])
        self.assertEqual(number_seqs_in_otu_table,
                         count_seqs(self.test_data['seqs'][0])[0])
Beispiel #6
0
    def test_run_pick_de_novo_otus_parallel(self):
        """run_pick_de_novo_otus generates expected results in parallel
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = \
            {'template_fp': self.test_data['refseqs_aligned'][0]}
        self.params['filter_alignment'] = \
            {'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
        actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=True,
            status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'pynast_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        failures_fp = join(self.test_out, 'pynast_aligned_seqs',
                           '%s_rep_set_failures.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        self.assertEqual(actual_tree_fp, tree_fp)
        self.assertEqual(actual_otu_table_fp, otu_table_fp)

        # Number of OTUs falls within a range that was manually
        # confirmed
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # number of seqs which aligned + num of seqs which failed to
        # align sum to the number of OTUs
        self.assertEqual(
            count_seqs(alignment_fp)[0] + count_seqs(failures_fp)[0], num_otus)

        # number of tips in the tree equals the number of sequences that
        # aligned
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1'
        ]
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # otu ids are as expected
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            [v.sum() for v in otu_table.iter_data()])
        self.assertEqual(number_seqs_in_otu_table,
                         count_seqs(self.test_data['seqs'][0])[0])

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)