Example #1
    def test_run_pick_de_novo_otus_parallel(self):
        """run_pick_de_novo_otus generates expected results in parallel
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = \
            {'template_fp': self.test_data['refseqs_aligned'][0]}
        self.params['filter_alignment'] = \
            {'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
        actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=True,
            status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'pynast_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        failures_fp = join(self.test_out, 'pynast_aligned_seqs',
                           '%s_rep_set_failures.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        self.assertEqual(actual_tree_fp, tree_fp)
        self.assertEqual(actual_otu_table_fp, otu_table_fp)

        input_seqs = LoadSeqs(self.test_data['seqs'][0],
                              format='fasta',
                              aligned=False)

        # Number of OTUs matches the manually confirmed count
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # number of seqs which aligned + num of seqs which failed to
        # align sum to the number of OTUs
        aln = LoadSeqs(alignment_fp)
        failures = LoadSeqs(failures_fp, aligned=False)
        self.assertEqual(aln.getNumSeqs() + failures.getNumSeqs(), num_otus)

        # number of tips in the tree equals the number of sequences that
        # aligned
        tree = LoadTree(tree_fp)
        self.assertEqual(len(tree.tips()), aln.getNumSeqs())

        # parse the otu table
        otu_table = parse_biom_table(open(otu_table_fp, 'U'))
        expected_sample_ids = [
            'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1'
        ]
        # sample IDs are as expected
        self.assertEqualItems(otu_table.SampleIds, expected_sample_ids)
        # otu ids are as expected
        self.assertEqualItems(otu_table.ObservationIds, otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            [v.sum() for v in otu_table.iterSampleData()])
        self.assertEqual(number_seqs_in_otu_table, input_seqs.getNumSeqs())

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)
Example #2
    def test_run_pick_de_novo_otus_muscle(self):
        """run_pick_de_novo_otus w muscle generates expected results
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = {'alignment_method': 'muscle'}
        self.params['filter_alignment'] = \
            {'suppress_lane_mask_filter': None,
             'entropy_threshold': '0.10'}

        run_pick_de_novo_otus(self.test_data['seqs'][0],
                              self.test_out,
                              call_commands_serially,
                              self.params,
                              self.qiime_config,
                              parallel=False,
                              status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'muscle_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        input_seqs = LoadSeqs(self.test_data['seqs'][0],
                              format='fasta',
                              aligned=False)

        # Number of OTUs matches the manually confirmed count
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # all OTUs align
        aln = LoadSeqs(alignment_fp)
        self.assertEqual(aln.getNumSeqs(), num_otus)

        # all OTUs in tree
        tree = LoadTree(tree_fp)
        self.assertEqual(len(tree.tips()), num_otus)

        # check that the two final output files have non-zero size
        self.assertTrue(getsize(tree_fp) > 0)
        self.assertTrue(getsize(otu_table_fp) > 0)

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)

        # parse the otu table
        otu_table = parse_biom_table(open(otu_table_fp, 'U'))
        expected_sample_ids = [
            'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1'
        ]
        # sample IDs are as expected
        self.assertEqualItems(otu_table.SampleIds, expected_sample_ids)
        # expected OTUs
        self.assertEqualItems(otu_table.ObservationIds, otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            [v.sum() for v in otu_table.iterSampleData()])
        self.assertEqual(number_seqs_in_otu_table, input_seqs.getNumSeqs())
Example #3
    def test_run_pick_de_novo_otus_muscle(self):
        """run_pick_de_novo_otus w muscle generates expected results
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = {'alignment_method': 'muscle'}
        self.params['filter_alignment'] = \
            {'suppress_lane_mask_filter': None,
             'entropy_threshold': '0.10'}

        run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=False,
            status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'muscle_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        # Number of OTUs matches the manually confirmed count
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # all OTUs align
        self.assertEqual(count_seqs(alignment_fp)[0], num_otus)

        # all OTUs in tree
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), num_otus)

        # check that the two final output files have non-zero size
        self.assertTrue(getsize(tree_fp) > 0)
        self.assertTrue(getsize(otu_table_fp) > 0)

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1',
            'f2',
            'f3',
            'f4',
            'p1',
            'p2',
            't1',
            't2',
            'not16S.1']
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # expected OTUs
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            [v.sum() for v in otu_table.iter_data()])
        self.assertEqual(number_seqs_in_otu_table,
                         count_seqs(self.test_data['seqs'][0])[0])
Example #4
    def test_run_pick_de_novo_otus_parallel(self):
        """run_pick_de_novo_otus generates expected results in parallel
        """
        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0]}
        self.params['align_seqs'] = \
            {'template_fp': self.test_data['refseqs_aligned'][0]}
        self.params['filter_alignment'] = \
            {'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
        actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=True,
            status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'pynast_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        failures_fp = join(self.test_out, 'pynast_aligned_seqs',
                           '%s_rep_set_failures.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        self.assertEqual(actual_tree_fp, tree_fp)
        self.assertEqual(actual_otu_table_fp, otu_table_fp)

        # Number of OTUs matches the manually confirmed count
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # number of seqs which aligned + num of seqs which failed to
        # align sum to the number of OTUs
        self.assertEqual(
            count_seqs(alignment_fp)[0] + count_seqs(failures_fp)[0], num_otus)

        # number of tips in the tree equals the number of sequences that
        # aligned
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1',
            'f2',
            'f3',
            'f4',
            'p1',
            'p2',
            't1',
            't2',
            'not16S.1']
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # otu ids are as expected
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            [v.sum() for v in otu_table.iter_data()])
        self.assertEqual(number_seqs_in_otu_table,
                         count_seqs(self.test_data['seqs'][0])[0])

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)
Example #5
    def test_run_pick_de_novo_otus_swarm_rdp_tax_assign_modify_resolution(
            self):
        """run_pick_de_novo_otus using swarm (d=4 which approximates 97%% id)
           generates expected results with rdp tax assignment
        """

        self.params['assign_taxonomy'] = \
            {'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
             'reference_seqs_fp': self.test_data['refseqs'][0],
             'assignment_method': 'rdp'}
        self.params['align_seqs'] = \
            {'template_fp': self.test_data['refseqs_aligned'][0]}
        self.params['filter_alignment'] = \
            {'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0]}
        self.params['pick_otus']['otu_picking_method'] = "swarm"
        # Swarm resolution = 4 is about 97% similarity for the
        # sequences clustered in this test
        self.params['pick_otus']['swarm_resolution'] = "4"

        actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=False,
            status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'swarm_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'pynast_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        failures_fp = join(self.test_out, 'pynast_aligned_seqs',
                           '%s_rep_set_failures.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'rdp_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        self.assertEqual(actual_tree_fp, tree_fp)
        self.assertEqual(actual_otu_table_fp, otu_table_fp)

        # Number of OTUs matches the manually confirmed count.
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 23)

        # all otus get taxonomy assignments
        taxonomy_assignment_lines = list(open(taxonomy_assignments_fp))
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # number of seqs which aligned + num of seqs which failed to
        # align sum to the number of OTUs
        self.assertEqual(
            count_seqs(alignment_fp)[0] + count_seqs(failures_fp)[0], num_otus)

        # number of tips in the tree equals the number of sequences that
        # aligned
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1'
        ]
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # otu ids are as expected
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            [v.sum() for v in otu_table.iter_data()])
        self.assertEqual(number_seqs_in_otu_table,
                         count_seqs(self.test_data['seqs'][0])[0])

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)
Example #6
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    verbose = opts.verbose

    input_fp = opts.input_fp
    output_dir = opts.output_dir
    verbose = opts.verbose
    print_only = opts.print_only

    parallel = opts.parallel
    # No longer checking that jobs_to_start > 2, but
    # commenting as we may change our minds about this.
    #if parallel: raise_error_on_parallel_unavailable()

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            raise IOError("Can't open parameters file (%s). Does it exist? Do you have read access?"
                          % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        params = parse_qiime_parameters([])
        # empty list returns empty defaultdict for now

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params, jobs_to_start,
                                   default_jobs_to_start, parallel,
                                   option_parser)

    try:
        makedirs(output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error(
                "Output directory already exists. Please choose"
                " a different directory, or force overwrite with -f.")

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    run_pick_de_novo_otus(
        input_fp,
        output_dir,
        command_handler=command_handler,
        params=params,
        qiime_config=qiime_config,
        parallel=parallel,
        status_update_callback=status_update_callback)
Example #7
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    verbose = opts.verbose

    input_fp = opts.input_fp
    output_dir = opts.output_dir
    verbose = opts.verbose
    print_only = opts.print_only

    parallel = opts.parallel
    # No longer checking that jobs_to_start > 2, but
    # commenting as we may change our minds about this.
    #if parallel: raise_error_on_parallel_unavailable()

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            raise IOError("Can't open parameters file (%s). Does it exist? Do you have read access?"
                          % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        params = parse_qiime_parameters([])
        # empty list returns empty defaultdict for now

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params,
                                   jobs_to_start,
                                   default_jobs_to_start,
                                   parallel,
                                   option_parser)

    try:
        makedirs(output_dir)
    except OSError:
        if opts.force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            option_parser.error("Output directory already exists. Please choose"
                                " a different directory, or force overwrite with -f.")

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    run_pick_de_novo_otus(
        input_fp,
        output_dir,
        command_handler=command_handler,
        params=params,
        qiime_config=qiime_config,
        parallel=parallel,
        status_update_callback=status_update_callback)