Example #1
0
    def setUp(self):
        """Build the named and unnamed RNA fixtures used by all tests."""
        degen_gapped = RNA.Alphabets.DegenGapped

        # sequences that carry an explicit name
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence(
            'UCAGGG', Name='rna1', Alphabet=degen_gapped)
        self.model2 = ModelSequence(
            'YCU-RG', Name='rna2', Alphabet=degen_gapped)
        self.model3 = ModelSequence(
            'CAA-NR', Name='rna3', Alphabet=degen_gapped)

        named_rnas = [self.rna1, self.rna2, self.rna3]
        self.aln = Alignment(named_rnas, MolType=RNA)
        named_models = [self.model1, self.model2, self.model3]
        self.da = DenseAlignment(
            named_models, MolType=RNA, Alphabet=degen_gapped)

        # the same sequences, this time without names
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG', Alphabet=degen_gapped)
        self.nn_model2 = ModelSequence('YCU-RG', Alphabet=degen_gapped)
        self.nn_model3 = ModelSequence('CAA-NR', Alphabet=degen_gapped)

        unnamed_rnas = [self.nn_rna1, self.nn_rna2, self.nn_rna3]
        self.nn_aln = Alignment(unnamed_rnas, MolType=RNA)
        unnamed_models = [self.nn_model1, self.nn_model2, self.nn_model3]
        self.nn_da = DenseAlignment(
            unnamed_models, MolType=RNA, Alphabet=degen_gapped)
Example #2
0
def remove_outliers(seqs, num_sigmas, fraction_seqs_for_stats=.95):
    """Remove sequences very different from the majority consensus.

    Given aligned seqs, a majority consensus is computed (via
    aln.majorityConsensus()) and, for every sequence, the number of
    alignment positions at which it differs from that consensus is counted.
    Any sequence whose difference count exceeds

        cutoff = mean(diffs) + num_sigmas * std(diffs)

    is removed.  The mean and standard deviation are computed only over the
    fraction_seqs_for_stats of sequences with the smallest difference
    counts, so that extreme outliers do not inflate the cutoff.

    seqs must be compatible with DenseAlignment:
    aln = DenseAlignment(data=seqs, MolType=DNA) is called

    Returns the sub-alignment (aln.getSubAlignment) holding the sequences
    whose difference count is <= cutoff.
    """
    aln = DenseAlignment(data=seqs, MolType=DNA)
    cons = DenseAlignment(data=aln.majorityConsensus(), MolType=DNA)
    # True wherever a sequence disagrees with the consensus at a position
    diff_mtx = cons.SeqData[:,0] != aln.SeqData

    # consider only a fraction of seqs for mean, std
    seq_diffs = diff_mtx.sum(1)
    # round() yields a float on Python 2; a slice bound must be an integer
    num_to_consider = int(round(len(seq_diffs)*fraction_seqs_for_stats))
    seq_diffs_considered_sorted = \
        seq_diffs[seq_diffs.argsort()[:num_to_consider]]
    # mean + e.g.: 4 sigma
    diff_cutoff = seq_diffs_considered_sorted.mean() + \
      num_sigmas*seq_diffs_considered_sorted.std()
    seq_idxs_to_keep = numpy.arange(len(seq_diffs))[seq_diffs <= diff_cutoff]

    filtered_aln = aln.getSubAlignment(seq_idxs_to_keep)
    return filtered_aln
Example #3
0
    def test_subset_positions_DenseAlignment(self):
        """takePositions and getSubAlignment(pos=...) select the same data"""
        degen_gapped = RNA.Alphabets.DegenGapped
        expected_seqs = [
            ModelSequence(text, Name=label, Alphabet=degen_gapped)
            for label, text in
            (('rna1', 'UCG'), ('rna2', 'YCG'), ('rna3', 'CAR'))
        ]
        sub_da = DenseAlignment(
            expected_seqs, MolType=RNA, Alphabet=degen_gapped)

        full_data = array([
            [0, 1, 2, 3, 3, 3],
            [15, 1, 0, 4, 12, 3],
            [1, 2, 2, 4, 10, 12],
        ])
        sub_data = array([
            [0, 1, 3],
            [15, 1, 3],
            [1, 2, 12],
        ])

        # Sanity-check the encoded arrays of the fixtures first
        self.assertEqual(self.da.ArraySeqs, full_data)
        self.assertEqual(self.da.ArrayPositions, transpose(full_data))
        self.assertEqual(sub_da.ArraySeqs, sub_data)
        self.assertEqual(sub_da.ArrayPositions, transpose(sub_data))

        columns = [0, 1, 5]
        via_take_positions = self.da.takePositions(columns)
        via_sub_alignment = self.da.getSubAlignment(pos=columns)

        # getSubAlignment: object equality and underlying data both match
        self.assertEqual(via_sub_alignment, sub_da)
        self.assertNotEqual(via_sub_alignment, self.da)
        self.assertEqual(via_sub_alignment.ArraySeqs, sub_data)
        self.assertEqual(
            via_sub_alignment.ArrayPositions, transpose(sub_data))

        # takePositions: object equality holds ...
        self.assertEqual(via_take_positions, sub_da)
        self.assertNotEqual(via_take_positions, self.da)
        # ... and so must the underlying data
        self.assertEqual(via_take_positions.ArraySeqs, sub_data)
        self.assertEqual(
            via_take_positions.ArrayPositions, transpose(sub_data))
    def setUp(self):
        """Create the RNA sequences and alignments every test relies on."""
        alphabet = RNA.Alphabets.DegenGapped

        # --- named sequences ---
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1', Alphabet=alphabet)
        self.model2 = ModelSequence('YCU-RG', Name='rna2', Alphabet=alphabet)
        self.model3 = ModelSequence('CAA-NR', Name='rna3', Alphabet=alphabet)

        self.aln = Alignment(
            [self.rna1, self.rna2, self.rna3],
            MolType=RNA)
        self.da = DenseAlignment(
            [self.model1, self.model2, self.model3],
            MolType=RNA,
            Alphabet=alphabet)

        # --- sequences without a name ---
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG', Alphabet=alphabet)
        self.nn_model2 = ModelSequence('YCU-RG', Alphabet=alphabet)
        self.nn_model3 = ModelSequence('CAA-NR', Alphabet=alphabet)

        self.nn_aln = Alignment(
            [self.nn_rna1, self.nn_rna2, self.nn_rna3],
            MolType=RNA)
        self.nn_da = DenseAlignment(
            [self.nn_model1, self.nn_model2, self.nn_model3],
            MolType=RNA,
            Alphabet=alphabet)
Example #5
0
 def test_aln_equality(self):
     """DenseAlignment equality: fewer seqs differ; order and MolType don't"""
     # When does something compare equal?
     self.assertEqual(self.da == self.da, True)
     # one sequence less
     other_da1 = DenseAlignment([self.model1, self.model2],\
         MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)
     self.assertEqual(self.da == other_da1, False)
     # seqs in different order -- doesn't matter
     other_da2 = DenseAlignment([self.model1, self.model3, self.model2],\
         MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)
     self.assertEqual(self.da == other_da2, True)
     # seqs in different encoding -- doesn't matter, only looks at data
     other_da3 = DenseAlignment([self.model1, self.model2, self.model3])
     # Should this compare False even though the data is exactly the same?
     # The MolType is different...
     self.assertEqual(self.da == other_da3, True)
     # Materialise the map: on Python 3 map() is a lazy iterator, which
     # alltrue would treat as a single truthy object instead of checking
     # every row.
     assert alltrue(
         list(map(alltrue, self.da.ArraySeqs == other_da3.ArraySeqs)))
Example #6
0
    def test_DenseAlignment_without_moltype(self):
        """DenseAlignment built without MolType still prints as RNA FASTA."""
        degen_gapped = RNA.Alphabets.DegenGapped
        first = ModelSequence('UCAG', Alphabet=degen_gapped, Name='rna1')
        second = ModelSequence('CCCR', Alphabet=degen_gapped, Name='rna2')

        da = DenseAlignment([first, second])

        exp_lines = ['>rna1', 'UCAG', '>rna2', 'CCCR']
        expected_text = '\n'.join(exp_lines) + '\n'
        self.assertEqual(str(da), expected_text)
    def setUp(self):
        """Set up alphabets and alignments shared by the recoding tests."""
        canonical = 'ACDEFGHIKLMNPQRSTVWY'
        ambiguous = 'BXZ'
        self.canonical_abbrevs = canonical
        self.ambiguous_abbrevs = ambiguous

        # one-state alphabet: every residue abbreviation collapses to 'A'
        self.all_to_a = [('A', canonical + ambiguous)]
        self.charge_2 = alphabets['charge_2']
        self.hydropathy_3 = alphabets['hydropathy_3']
        self.orig = alphabets['orig']

        seq_data = {'1':'CDDFBXZ', '2':'CDD-BXZ', '3':'AAAASS-'}
        # each constructor receives its own copy of the mapping
        self.aln = DenseAlignment(data=dict(seq_data))
        self.aln2 = LoadSeqs(data=dict(seq_data))
    def test_recode_dense_alignment(self):
        """recode_dense_alignment: recoding to reduced alphabets works"""
        charge_expected = DenseAlignment(
            data={'1':'AKKAKAK','2':'AKK-KAK','3':'AAAAAA-'})
        hydropathy_expected = DenseAlignment(
            data={'1':'PRRPRPR','2':'PRR-RPR','3':'PPPPYY-'})
        all_a_expected = DenseAlignment(
            data={'1':'AAAAAAA','2':'AAA-AAA','3':'AAAAAA-'})

        # (keyword arguments, expected alignment), checked in this order
        cases = [
            # charge_2 looked up by id, then passed as a definition
            ({'alphabet_id': 'charge_2'}, charge_expected),
            ({'alphabet_def': self.charge_2}, charge_expected),
            # hydropathy_3 looked up by id, then passed as a definition
            ({'alphabet_id': 'hydropathy_3'}, hydropathy_expected),
            ({'alphabet_def': self.hydropathy_3}, hydropathy_expected),
            # everything collapsed onto a single character
            ({'alphabet_def': self.all_to_a}, all_a_expected),
            # characters that are not remapped stay in their original state
            ({'alphabet_def': [('a', 'b')]}, self.aln),
        ]
        for kwargs, expected_aln in cases:
            recoded = recode_dense_alignment(self.aln, **kwargs)
            self.assertEqual(recoded, expected_aln)

        # a non-alphabetic character is mapped like an alphabetic one
        recoded = recode_dense_alignment(self.aln, alphabet_def=[('.', '-')])
        gap_recoded = DenseAlignment(
            data={'1':'CDDFBXZ', '2':'CDD.BXZ', '3':'AAAASS.'})
        self.assertEqual(recoded, gap_recoded)
Example #9
0
    def test_subset_seqs_DenseAlignment(self):
        """takeSeqs by name matches getSubAlignment by sequence index"""
        degen_gapped = RNA.Alphabets.DegenGapped
        model1 = ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped)
        model2 = ModelSequence('YCG', Name='rna2', Alphabet=degen_gapped)
        model3 = ModelSequence('CAR', Name='rna3', Alphabet=degen_gapped)
        # built as in the original test; not referenced by the assertions
        sub_da = DenseAlignment(
            [model1, model2, model3], MolType=RNA, Alphabet=degen_gapped)

        # Selecting 'rna1' by name should equal selecting row 0 by index.
        by_name = self.da.takeSeqs(['rna1'])
        by_index = self.da.getSubAlignment(seqs=[0])

        # Both the objects and their string forms have to agree.
        self.assertEqual(by_name, by_index)
        self.assertEqual(str(by_name), str(by_index))
Example #10
0
def recode_dense_alignment(aln, alphabet_id=None, alphabet_def=None):
    """Return new DenseAlignment recoded in the provided reduced-state alphabet

        aln: the DenseAlignment object to be recoded
        alphabet_id: string identifying an alphabet in
            cogent.util.recode_alignment.alphabets.
            (See cogent.util.recode_alignment.alphabets.keys()
            for valid alphabet_ids.)
        alphabet_def: list of two-element tuples where the first element is
            the new alphabet character and the second element is an iterable
            object containing the old alphabet chars which should be mapped
            to the new char.
            e.g., [('A','CVILFMWAGSTPYH'),('B','QNDERKBZ')]
            (See cogent.util.recode_alignment.alphabets.values()
            for more examples.)

        Note: either alphabet_id OR alphabet_def must be passed. Either
            provide the alphabet, or have it looked up. If both are provided
            the alphabet_id is ignored.

        The returned alignment keeps aln's names (a copy of aln.Names) and
        aln's MolType.
    """

    # Construct a dict mapping each character of the alignment's alphabet
    # to its index in that alphabet (the integer code used in the
    # alignment's arrays). It is used to look up the codes of both the
    # old and the new characters.
    byte_map = dict(zip(aln.Alphabet, range(len(aln.Alphabet))))

    # Construct a dict mapping old characters to new characters.
    alphabet_map = build_alphabet_map(alphabet_id=alphabet_id,\
        alphabet_def=alphabet_def)

    # Create the recoded version of seqs.Alphabet: start from the identity
    # mapping and redirect every remapped code. A real list is required
    # because a Python 3 range object does not support item assignment.
    new_indices = list(range(len(aln.Alphabet)))
    for old, new in alphabet_map.items():
        new_indices[byte_map[old]] = byte_map[new]

    # Map the old alphabet onto the new alphabet. Note: characters
    # that are not mapped keep their original code. Returns a new
    # DenseAlignment.
    return DenseAlignment(take(new_indices,aln.ArraySeqs).transpose(),\
        aln.Names[:],MolType=aln.MolType)
def main():
    """Insert query sequences into a tree with the selected insertion tool.

    Reads the command-line options described by ``script_info``, loads the
    query alignment from ``opts.input_fasta_fp``, assembles the parameter
    dict for the chosen method (``raxml_v730``, ``pplacer`` or
    ``parsinsert``), calls that module's ``insert_sequences_into_tree`` and
    writes the tree to ``<output_dir>/<method>_final_placement.tre``.

    User-supplied parameters from ``opts.method_params_fp`` (when given) are
    used as the starting point for the method's parameters; the primary
    parameters set below then overwrite or complete them.

    Raises:
        IOError: if pplacer is selected but ``opts.stats_fp`` is not set.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    parameters = {}

    # get the tree insertion method to use
    module = opts.insertion_method

    # create output directory
    output_dir = opts.output_dir
    create_dir(output_dir)

    # list of tree insertion methods
    tree_insertion_module_names = \
        {'raxml_v730': brokit.raxml_v730,
         'parsinsert': brokit.parsinsert,
         'pplacer': brokit.pplacer}

    # load input sequences and convert to phylip since the tools require
    # the query sequences to phylip-compliant names
    with open(opts.input_fasta_fp, 'U') as input_fasta_f:
        aln = DenseAlignment(parse_fasta(input_fasta_f))
    seqs, align_map = aln.toPhylip()

    # Always bind param_dict so the per-method lookups below cannot hit an
    # unbound name when no parameter file was supplied.
    param_dict = {}
    if opts.method_params_fp:
        with open(opts.method_params_fp, 'U') as params_f:
            param_dict = parse_qiime_parameters(params_f)

    if module == 'raxml_v730':
        # load the reference sequences
        with open(opts.refseq_fp, 'U') as refseq_f:
            load_ref_aln = DenseAlignment(parse_fasta(refseq_f))

        # combine and load the reference plus query
        combined_aln = parse_fasta(StringIO(load_ref_aln.toFasta() +
                                            '\n' + aln.toFasta()))
        # overwrite the alignment map
        aln = DenseAlignment(combined_aln)
        seqs, align_map = aln.toPhylip()

        # fall back to no user parameters when none were given for raxml
        parameters = param_dict.get('raxml', {})

        tree = convert_tree_tips(align_map, opts.starting_tree_fp)

        # write out the tree with phylip labels
        updated_tree_fp = join(output_dir,
                               '%s_phylip_named_tree.tre' % (module))
        write_updated_tree_file(updated_tree_fp, tree)

        # set the primary parameters for raxml
        parameters['-w'] = abspath(output_dir) + '/'
        parameters["-n"] = split(splitext(get_tmp_filename())[0])[-1]
        parameters["-t"] = updated_tree_fp

        # defaults that the user may override via the parameter file
        if "-f" not in parameters:
            parameters["-f"] = 'v'
        if "-m" not in parameters:
            parameters["-m"] = 'GTRGAMMA'

    elif module == 'pplacer':
        parameters = param_dict.get('pplacer', {})

        # make sure stats file is passed
        if not opts.stats_fp:
            raise IOError(
                'When using pplacer, the RAxML produced info file is required.')

        # set the primary parameters for pplacer - allow for user-defined
        parameters['--out-dir'] = abspath(output_dir) + '/'
        parameters["-t"] = opts.starting_tree_fp
        parameters['-r'] = opts.refseq_fp
        parameters['-s'] = opts.stats_fp

    elif module == 'parsinsert':
        parameters = param_dict.get('parsinsert', {})

        # define log fp
        log_fp = join(output_dir, 'parsinsert.log')

        # define tax assignment values fp
        tax_assign_fp = join(output_dir, 'parsinsert_assignments.log')
        parameters["-l"] = log_fp
        parameters["-o"] = tax_assign_fp
        parameters["-s"] = opts.refseq_fp
        parameters["-t"] = opts.starting_tree_fp

    # call the module and return a tree object
    insertion_module = tree_insertion_module_names[module]
    result = insertion_module.insert_sequences_into_tree(
        seqs, moltype=DNA, params=parameters)

    # NOTE(review): the relabelled tree is bound to result_tree, yet
    # `result` is what gets written below. That is only correct if the
    # helper renames the tips of `result` in place -- confirm.
    result_tree = strip_and_rename_unwanted_labels_from_tree(align_map, result)

    # write out the resulting tree
    final_tree = join(output_dir, '%s_final_placement.tre' % (module))
    write_updated_tree_file(final_tree, result)
Example #12
0
class AllTests(TestCase):
    """Checks that Alignment and DenseAlignment behave consistently."""

    def setUp(self):
        """Create named and unnamed RNA sequences plus their alignments."""
        degen_gapped = RNA.Alphabets.DegenGapped

        # sequences that carry an explicit name
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence(
            'UCAGGG', Name='rna1', Alphabet=degen_gapped)
        self.model2 = ModelSequence(
            'YCU-RG', Name='rna2', Alphabet=degen_gapped)
        self.model3 = ModelSequence(
            'CAA-NR', Name='rna3', Alphabet=degen_gapped)

        self.aln = Alignment(
            [self.rna1, self.rna2, self.rna3], MolType=RNA)
        self.da = DenseAlignment(
            [self.model1, self.model2, self.model3],
            MolType=RNA, Alphabet=degen_gapped)

        # the same sequences without names
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG', Alphabet=degen_gapped)
        self.nn_model2 = ModelSequence('YCU-RG', Alphabet=degen_gapped)
        self.nn_model3 = ModelSequence('CAA-NR', Alphabet=degen_gapped)

        self.nn_aln = Alignment(
            [self.nn_rna1, self.nn_rna2, self.nn_rna3], MolType=RNA)
        self.nn_da = DenseAlignment(
            [self.nn_model1, self.nn_model2, self.nn_model3],
            MolType=RNA, Alphabet=degen_gapped)

    def test_printing_named_seqs(self):
        """Printing named seqs should work the same on Aln and DenseAln"""
        # The trailing newline after the last sequence is intentional:
        # every FASTA-format record should be terminated/separated.
        exp_lines_general = [
            '>rna1', 'UCAGGG', '>rna2', 'YCU-RG', '>rna3', 'CAA-NR'
        ]
        expected_text = '\n'.join(exp_lines_general) + '\n'
        self.assertEqual(str(self.aln), expected_text)
        self.assertEqual(str(self.da), expected_text)

    def test_printing_unnamed_seqs(self):
        """Printing unnamed sequences should work the same on Aln and DenseAln
        """
        # the final newline is carried by the last list element here
        exp_lines_gen = [
            '>seq_0', 'UCAGGG', '>seq_1', 'YCU-RG', '>seq_2', 'CAA-NR\n'
        ]
        expected_text = '\n'.join(exp_lines_gen)
        self.assertEqual(str(self.nn_aln), expected_text)
        self.assertEqual(str(self.nn_da), expected_text)

    def test_DenseAlignment_without_moltype(self):
        """Expect MolType to be picked up from the sequences."""
        degen_gapped = RNA.Alphabets.DegenGapped
        first = ModelSequence('UCAG', Alphabet=degen_gapped, Name='rna1')
        second = ModelSequence('CCCR', Alphabet=degen_gapped, Name='rna2')

        da = DenseAlignment([first, second])

        exp_lines = ['>rna1', 'UCAG', '>rna2', 'CCCR']
        self.assertEqual(str(da), '\n'.join(exp_lines) + '\n')

    def test_names(self):
        """Names should agree between Alignment and DenseAlignment"""
        named = ['rna1', 'rna2', 'rna3']
        self.assertEqual(self.aln.Names, named)
        self.assertEqual(self.da.Names, named)
        # Unnamed sequences now get the same default names in both classes.
        default_named = ['seq_0', 'seq_1', 'seq_2']
        self.assertEqual(self.nn_aln.Names, default_named)
        self.assertEqual(self.nn_da.Names, default_named)

    def test_seqFreqs(self):
        """seqFreqs should work the same on Alignment and DenseAlignment"""
        # Used alphabet: ('U', 'C', 'A', 'G', '-', 'B', 'D', 'H',
        # 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y')
        # NOTE(review): the list above names 16 symbols while each expected
        # row has 17 columns -- presumably one symbol is missing from the
        # comment; confirm against the alphabet definition.
        exp = [
            [1,1,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0],
            [1,1,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0],
            [0,1,2,0,1,0,0,0,0,0,1,0,1,0,0,0,0],
        ]
        # DenseAlignment has always supported this
        self.assertEqual(self.da.getSeqFreqs().Data, exp)
        # Alignment used to raise an error here, but now works
        self.assertEqual(self.aln.getSeqFreqs().Data, exp)

    def test_subset_positions_DenseAlignment(self):
        """takePositions and getSubAlignment(pos=...) select the same data"""
        degen_gapped = RNA.Alphabets.DegenGapped
        model1 = ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped)
        model2 = ModelSequence('YCG', Name='rna2', Alphabet=degen_gapped)
        model3 = ModelSequence('CAR', Name='rna3', Alphabet=degen_gapped)
        sub_da = DenseAlignment(
            [model1, model2, model3], MolType=RNA, Alphabet=degen_gapped)

        full_data = array([
            [0, 1, 2, 3, 3, 3],
            [15, 1, 0, 4, 12, 3],
            [1, 2, 2, 4, 10, 12],
        ])
        sub_data = array([
            [0, 1, 3],
            [15, 1, 3],
            [1, 2, 12],
        ])

        # Sanity-check the encoded arrays of the fixtures first
        self.assertEqual(self.da.ArraySeqs, full_data)
        self.assertEqual(self.da.ArrayPositions, transpose(full_data))
        self.assertEqual(sub_da.ArraySeqs, sub_data)
        self.assertEqual(sub_da.ArrayPositions, transpose(sub_data))

        columns = [0, 1, 5]
        via_take_positions = self.da.takePositions(columns)
        via_sub_alignment = self.da.getSubAlignment(pos=columns)

        # getSubAlignment: object equality and underlying data both match
        self.assertEqual(via_sub_alignment, sub_da)
        self.assertNotEqual(via_sub_alignment, self.da)
        self.assertEqual(via_sub_alignment.ArraySeqs, sub_data)
        self.assertEqual(
            via_sub_alignment.ArrayPositions, transpose(sub_data))

        # takePositions: object equality holds ...
        self.assertEqual(via_take_positions, sub_da)
        self.assertNotEqual(via_take_positions, self.da)
        # ... and so must the underlying data
        self.assertEqual(via_take_positions.ArraySeqs, sub_data)
        self.assertEqual(
            via_take_positions.ArrayPositions, transpose(sub_data))
        # Shouldn't the __eq__ method check the data at least?

    def test_subset_positions_Alignment(self):
        """Alignment.takePositions returns the expected sub-alignment"""
        expected_seqs = [
            RnaSequence('UCG', Name='rna1'),
            RnaSequence('YCG', Name='rna2'),
            RnaSequence('CAR', Name='rna3'),
        ]
        sub_aln = Alignment(expected_seqs, MolType=RNA)

        obs_sub_aln = self.aln.takePositions([0, 1, 5])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # The string forms must match as well, which requires sequence
        # order to be maintained (covered by a separate test).
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

    def test_takePositions_sequence_order(self):
        """Alignment takePositions should maintain seq order"""
        ordered_names = ['rna1', 'rna2', 'rna3']
        # DenseAlignment keeps the order
        self.assertEqual(self.da.Names, ordered_names)
        sub_da = self.da.getSubAlignment(pos=[0, 1, 5])
        self.assertEqual(sub_da.Names, ordered_names)
        # Alignment has to keep it too
        self.assertEqual(self.aln.Names, ordered_names)
        sub_aln = self.aln.takePositions([0, 1, 5])
        self.assertEqual(sub_aln.Names, ordered_names)

    def test_subset_seqs_Alignment(self):
        """Alignment.takeSeqs selects sequences in the requested order"""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna2, rna3], MolType=RNA)
        aln = Alignment([rna1, rna2, rna3], MolType=RNA)
        obs_sub_aln = aln.takeSeqs(['rna2', 'rna3'])

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Requesting the names in a different order must give a
        # different string representation.
        reversed_pick = self.aln.takeSeqs(['rna3', 'rna2'])
        forward_pick = self.aln.takeSeqs(['rna2', 'rna3'])
        self.assertNotEqual(str(reversed_pick), str(forward_pick))

    def test_subset_seqs_DenseAlignment(self):
        """takeSeqs by name matches getSubAlignment by sequence index"""
        degen_gapped = RNA.Alphabets.DegenGapped
        model1 = ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped)
        model2 = ModelSequence('YCG', Name='rna2', Alphabet=degen_gapped)
        model3 = ModelSequence('CAR', Name='rna3', Alphabet=degen_gapped)
        # built as in the original test; not referenced by the assertions
        sub_da = DenseAlignment(
            [model1, model2, model3], MolType=RNA, Alphabet=degen_gapped)

        # Selecting 'rna1' by name should equal selecting row 0 by index.
        by_name = self.da.takeSeqs(['rna1'])
        by_index = self.da.getSubAlignment(seqs=[0])

        # Both the objects and their string forms have to agree.
        self.assertEqual(by_name, by_index)
        self.assertEqual(str(by_name), str(by_index))

    def test_aln_equality(self):
        """DenseAlignment equality: fewer seqs differ; order/MolType don't"""
        degen_gapped = RNA.Alphabets.DegenGapped
        # an alignment equals itself
        self.assertEqual(self.da == self.da, True)
        # one sequence less
        other_da1 = DenseAlignment(
            [self.model1, self.model2],
            MolType=RNA, Alphabet=degen_gapped)
        self.assertEqual(self.da == other_da1, False)
        # seqs in different order -- doesn't matter
        other_da2 = DenseAlignment(
            [self.model1, self.model3, self.model2],
            MolType=RNA, Alphabet=degen_gapped)
        self.assertEqual(self.da == other_da2, True)
        # seqs in different encoding -- doesn't matter, only looks at data
        other_da3 = DenseAlignment([self.model1, self.model2, self.model3])
        # Should this compare False even though the data is exactly the same?
        # The MolType is different...
        self.assertEqual(self.da == other_da3, True)
        rows_equal = list(
            map(alltrue, self.da.ArraySeqs == other_da3.ArraySeqs))
        assert alltrue(rows_equal)

    def test_seq_equality(self):
        """Two ModelSequences built from identical arguments are equal"""
        degen_gapped = RNA.Alphabets.DegenGapped
        left = ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped)
        right = ModelSequence('UCG', Name='rna1', Alphabet=degen_gapped)
        # the objects themselves should compare equal ...
        self.assertEqual(left, right)
        # ... just like their string forms
        self.assertEqual(str(left), str(right))

    def test_seq_ungapping(self):
        """degap() removes gaps on both RnaSequence and ModelSequence"""
        gapped = 'U-C-A-G-'
        rna1 = RnaSequence('U-C-A-G-', Name='rna1')
        model1 = ModelSequence(
            'U-C-A-G-', Name='rna1', Alphabet=RNA.Alphabets.DegenGapped)

        self.assertEqual(rna1, gapped)
        self.assertEqual(rna1.degap(), 'UCAG')

        # the model sequence must round-trip to the right string first
        self.assertEqual(str(model1), gapped)
        self.assertEqual(model1._data, [0, 4, 1, 4, 2, 4, 3, 4])
        # ModelSequence should maybe have the same degap method as normal Seq
        self.assertEqual(str(model1.degap()), 'UCAG')

    def test_the_rest_of_ModelSequence(self):
        """The class ModelSequence has 14 methods, but only 2 unittests.
        You might want to add some tests there..."""
        # Placeholder: most of these methods are exercised through the
        # derived classes, for convenience.
        pass
class AllTests(TestCase):
    """Checks that Alignment and DenseAlignment behave consistently.

    Every test works on the same three RNA sequences, built once as
    RnaSequence objects (for Alignment) and once as ModelSequence objects
    (for DenseAlignment), in a named and an unnamed variant.
    """

    def setUp(self):
        """setUp method for all tests"""
        # named sequences
        self.rna1 = RnaSequence('UCAGGG', Name='rna1')
        self.rna2 = RnaSequence('YCU-RG', Name='rna2')
        self.rna3 = RnaSequence('CAA-NR', Name='rna3')
        self.model1 = ModelSequence('UCAGGG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.model2 = ModelSequence('YCU-RG', Name='rna2',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.model3 = ModelSequence('CAA-NR', Name='rna3',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.aln = Alignment([self.rna1, self.rna2, self.rna3], MolType=RNA)
        self.da = DenseAlignment([self.model1, self.model2, self.model3],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

        # seqs no name
        self.nn_rna1 = RnaSequence('UCAGGG')
        self.nn_rna2 = RnaSequence('YCU-RG')
        self.nn_rna3 = RnaSequence('CAA-NR')

        self.nn_model1 = ModelSequence('UCAGGG',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.nn_model2 = ModelSequence('YCU-RG',
            Alphabet=RNA.Alphabets.DegenGapped)
        self.nn_model3 = ModelSequence('CAA-NR',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.nn_aln = Alignment([self.nn_rna1, self.nn_rna2, self.nn_rna3],
            MolType=RNA)
        self.nn_da = DenseAlignment([self.nn_model1, self.nn_model2,
            self.nn_model3], MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

    def test_printing_named_seqs(self):
        """Printing named seqs should work the same on Aln and DenseAln"""
        #Note: the newline trailing each sequence is intentional, because
        #we want each FASTA-format record to be separated.
        exp_lines_general = ['>rna1','UCAGGG','>rna2','YCU-RG','>rna3','CAA-NR']
        self.assertEqual(str(self.aln), '\n'.join(exp_lines_general) + '\n')
        self.assertEqual(str(self.da), '\n'.join(exp_lines_general) + '\n')

    def test_printing_unnamed_seqs(self):
        """Printing unnamed sequences should work the same on Aln and DenseAln
        """
        # same layout as the named case, including the trailing newline
        exp_lines_gen = ['>seq_0','UCAGGG','>seq_1','YCU-RG','>seq_2','CAA-NR']
        self.assertEqual(str(self.nn_aln), '\n'.join(exp_lines_gen) + '\n')
        self.assertEqual(str(self.nn_da), '\n'.join(exp_lines_gen) + '\n')

    def test_DenseAlignment_without_moltype(self):
        """Expect MolType to be picked up from the sequences."""

        m1 = ModelSequence('UCAG', Alphabet=RNA.Alphabets.DegenGapped,
            Name='rna1')
        m2 = ModelSequence('CCCR', Alphabet=RNA.Alphabets.DegenGapped,
            Name='rna2')
        da = DenseAlignment([m1, m2])
        exp_lines = ['>rna1','UCAG','>rna2','CCCR']
        self.assertEqual(str(da), '\n'.join(exp_lines) + '\n')

    def test_names(self):
        """Names should be handled the same on Alignment and DenseAlignment"""
        self.assertEqual(self.aln.Names, ['rna1','rna2','rna3'])
        self.assertEqual(self.da.Names, ['rna1','rna2','rna3'])
        # On unnamed sequences the behavior is now the same.
        self.assertEqual(self.nn_aln.Names, ['seq_0','seq_1','seq_2'])
        self.assertEqual(self.nn_da.Names, ['seq_0','seq_1','seq_2'])

    def test_seqFreqs(self):
        """seqFreqs should work the same on Alignment and DenseAlignment"""
        # Used alphabet: ('U', 'C', 'A', 'G', '-', 'B', 'D', 'H',\
        # 'K', 'M', 'N', 'S', 'R', 'W', 'V', 'Y')
        # NOTE(review): each expected row has 17 columns, one more than the
        # 16 symbols listed above -- confirm what the last column stands for.
        exp = [[1,1,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0],
            [1,1,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0],
            [0,1,2,0,1,0,0,0,0,0,1,0,1,0,0,0,0]]
        # This works
        self.assertEqual(self.da.getSeqFreqs().Data, exp)
        # This used to raise an error, but now works
        self.assertEqual(self.aln.getSeqFreqs().Data, exp)

    def test_subset_positions_DenseAlignment(self):
        """takePositions/getSubAlignment should select DenseAlignment columns
        """
        model1 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('YCG', Name='rna2',
            Alphabet=RNA.Alphabets.DegenGapped)
        model3 = ModelSequence('CAR', Name='rna3',
            Alphabet=RNA.Alphabets.DegenGapped)
        sub_da = DenseAlignment([model1, model2, model3],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)

        full_data = array([[0,1,2,3,3,3],[15,1,0,4,12,3],[1,2,2,4,10,12]])
        sub_data = array([[0,1,3],[15,1,3],[1,2,12]])

        # First check some data
        self.assertEqual(self.da.ArraySeqs, full_data)
        self.assertEqual(self.da.ArrayPositions, transpose(full_data))
        self.assertEqual(sub_da.ArraySeqs, sub_data)
        self.assertEqual(sub_da.ArrayPositions, transpose(sub_data))

        obs_sub_da_TP = self.da.takePositions([0,1,5])
        obs_sub_da_SA = self.da.getSubAlignment(pos=[0,1,5])

        # getSubAlignment: the result equals the expected sub-alignment,
        # differs from the full one, and carries the right arrays
        self.assertEqual(obs_sub_da_SA, sub_da)
        self.assertNotEqual(obs_sub_da_SA, self.da)
        self.assertEqual(obs_sub_da_SA.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_SA.ArrayPositions, transpose(sub_data))

        # takePositions: same expectations, checked on the object and on
        # the underlying arrays separately
        self.assertEqual(obs_sub_da_TP, sub_da)
        self.assertNotEqual(obs_sub_da_TP, self.da)
        self.assertEqual(obs_sub_da_TP.ArraySeqs, sub_data)
        self.assertEqual(obs_sub_da_TP.ArrayPositions, transpose(sub_data))

    def test_subset_positions_Alignment(self):
        """takePositions should select the requested Alignment columns"""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna1, rna2, rna3], MolType=RNA)

        obs_sub_aln = self.aln.takePositions([0,1,5])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # string representations should be the same, which also requires
        # that sequence order is maintained (checked on its own in
        # test_takePositions_sequence_order).
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

    def test_takePositions_sequence_order(self):
        """Alignment takePositions should maintain seq order"""
        # DenseAlignment
        self.assertEqual(self.da.Names, ['rna1','rna2','rna3'])
        sub_da = self.da.getSubAlignment(pos=[0,1,5])
        self.assertEqual(sub_da.Names, ['rna1','rna2','rna3'])
        # Alignment
        self.assertEqual(self.aln.Names, ['rna1','rna2','rna3'])
        sub_aln = self.aln.takePositions([0,1,5])
        self.assertEqual(sub_aln.Names, ['rna1','rna2','rna3'])

    def test_subset_seqs_Alignment(self):
        """takeSeqs should select Alignment seqs in the requested order"""
        rna1 = RnaSequence('UCG', Name='rna1')
        rna2 = RnaSequence('YCG', Name='rna2')
        rna3 = RnaSequence('CAR', Name='rna3')

        sub_aln = Alignment([rna2, rna3], MolType=RNA)
        aln = Alignment([rna1, rna2, rna3], MolType=RNA)
        obs_sub_aln = aln.takeSeqs(['rna2','rna3'])

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences come back in the order they were asked for,
        # so the two string representations must differ.
        obs_sub_aln_1 = self.aln.takeSeqs(['rna3','rna2'])
        obs_sub_aln_2 = self.aln.takeSeqs(['rna2','rna3'])
        self.assertNotEqual(str(obs_sub_aln_1), str(obs_sub_aln_2))

    def test_subset_seqs_DenseAlignment(self):
        """takeSeqs by name should match getSubAlignment by seq index"""
        obs_sub_da_TS = self.da.takeSeqs(['rna1'])
        obs_sub_da_SA = self.da.getSubAlignment(seqs=[0])

        # These two are now the same. Fixed mapping of key to char array.
        self.assertEqual(obs_sub_da_TS, obs_sub_da_SA)
        self.assertEqual(str(obs_sub_da_TS), str(obs_sub_da_SA))

    def test_aln_equality(self):
        """DenseAlignment == should ignore seq order and MolType"""
        # When does something compare equal?
        self.assertEqual(self.da == self.da, True)
        # one sequence less
        other_da1 = DenseAlignment([self.model1, self.model2],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)
        self.assertEqual(self.da == other_da1, False)
        # seqs in different order -- doesn't matter
        other_da2 = DenseAlignment([self.model1, self.model3, self.model2],
            MolType=RNA, Alphabet=RNA.Alphabets.DegenGapped)
        self.assertEqual(self.da == other_da2, True)
        # seqs in different encoding -- doesn't matter, only looks at data
        other_da3 = DenseAlignment([self.model1, self.model2, self.model3])
        # Should this compare False even though the data is exactly the same?
        # The MolType is different...
        self.assertEqual(self.da == other_da3, True)
        # Build a real list of per-row results: handing alltrue a lazy map
        # object (Python 3) would be truthy whatever the rows contain.
        # assertTrue, unlike a bare assert, also survives python -O.
        rows_equal = [alltrue(row)
            for row in self.da.ArraySeqs == other_da3.ArraySeqs]
        self.assertTrue(alltrue(rows_equal))

    def test_seq_equality(self):
        """ModelSequences built from identical arguments should compare equal
        """
        model1 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        model2 = ModelSequence('UCG', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)
        # the objects themselves compare equal
        self.assertEqual(model1, model2)
        # string comparison is True
        self.assertEqual(str(model1), str(model2))

    def test_seq_ungapping(self):
        """degap should strip gaps from RnaSequence and ModelSequence alike"""
        rna1 = RnaSequence('U-C-A-G-', Name='rna1')
        model1 = ModelSequence('U-C-A-G-', Name='rna1',
            Alphabet=RNA.Alphabets.DegenGapped)

        self.assertEqual(rna1, 'U-C-A-G-')
        self.assertEqual(rna1.degap(), 'UCAG')

        # check it produces the right string from the beginning
        self.assertEqual(str(model1), 'U-C-A-G-')
        self.assertEqual(model1._data, [0,4,1,4,2,4,3,4])
        # ModelSequence degap gives the same ungapped string as normal Seq
        self.assertEqual(str(model1.degap()), 'UCAG')

    def test_the_rest_of_ModelSequence(self):
        """The class ModelSequence has 14 methods, but only 2 unittests.
        You might want to add some tests there..."""
        #note: mostly these are tested in derived classes, for convenience.
        pass
Example #14
0
def main():
    """Insert query sequences into a starting tree with the chosen tool.

    Parses the command line described by ``script_info``, converts the input
    alignment to phylip-compliant names, assembles the parameters for the
    selected insertion method (``raxml_v730``, ``pplacer`` or
    ``parsinsert``), runs that method's ``insert_sequences_into_tree`` and
    writes the resulting tree to
    ``<output_dir>/<method>_final_placement.tre``.

    Raises:
        IOError: if pplacer is selected and no stats file was supplied.
    """
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    parameters = {}

    # get the tree insertion method to use
    module = opts.insertion_method

    # create output directory
    output_dir = opts.output_dir
    create_dir(output_dir)

    # list of tree insertion methods
    tree_insertion_module_names = \
                {'raxml_v730':cogent.app.raxml_v730,
                 'parsinsert':cogent.app.parsinsert,
                 'pplacer':cogent.app.pplacer}

    # load input sequences and convert to phylip since the tools require
    # the query sequences to phylip-compliant names
    # NOTE(review): assumes DenseAlignment reads the whole parser while it
    # is being constructed, so the file can be closed right afterwards.
    with open(opts.input_fasta_fp, 'U') as input_fasta_f:
        aln = DenseAlignment(MinimalFastaParser(input_fasta_f))
    seqs, align_map = aln.toPhylip()

    # the parameter file is optional: start from an empty mapping so the
    # per-method lookups below never touch an unbound name
    param_dict = {}
    if opts.method_params_fp:
        with open(opts.method_params_fp, 'U') as params_f:
            param_dict = parse_qiime_parameters(params_f)

    if module == 'raxml_v730':
        # load the reference sequences
        with open(opts.refseq_fp, 'U') as refseq_f:
            load_ref_aln = DenseAlignment(MinimalFastaParser(refseq_f))

        # combine and load the reference plus query
        combined_aln = MinimalFastaParser(StringIO(load_ref_aln.toFasta() + \
                                                   '\n' + aln.toFasta()))
        # overwrite the alignment map
        aln = DenseAlignment(combined_aln)
        seqs, align_map = aln.toPhylip()

        # user-supplied raxml parameters, if any
        try:
            parameters = param_dict['raxml']
        except KeyError:
            parameters = {}

        tree = convert_tree_tips(align_map, opts.starting_tree_fp)

        # write out the tree with phylip labels
        updated_tree_fp = join(output_dir, \
                                '%s_phylip_named_tree.tre' % (module))
        write_updated_tree_file(updated_tree_fp, tree)

        # set the primary parameters for raxml
        parameters['-w'] = abspath(output_dir) + '/'
        parameters["-n"] = split(splitext(get_tmp_filename())[0])[-1]
        parameters["-t"] = updated_tree_fp

        # only fill these in when the user did not choose them
        if "-f" not in parameters:
            parameters["-f"] = 'v'
        if "-m" not in parameters:
            parameters["-m"] = 'GTRGAMMA'

    elif module == 'pplacer':
        # user-supplied pplacer parameters, if any
        try:
            parameters = param_dict['pplacer']
        except KeyError:
            parameters = {}

        # make sure stats file is passed
        if not opts.stats_fp:
            raise IOError(
                'When using pplacer, the RAxML produced info file is required.')

        # set the primary parameters for pplacer - allow for user-defined
        parameters['--out-dir'] = abspath(output_dir) + '/'
        parameters["-t"] = opts.starting_tree_fp
        parameters['-r'] = opts.refseq_fp
        parameters['-s'] = opts.stats_fp

    elif module == 'parsinsert':
        # user-supplied parsinsert parameters, if any
        try:
            parameters = param_dict['parsinsert']
        except KeyError:
            parameters = {}

        # define log fp
        log_fp = join(output_dir, 'parsinsert.log')

        # define tax assignment values fp
        tax_assign_fp = join(output_dir, 'parsinsert_assignments.log')
        parameters["-l"] = log_fp
        parameters["-o"] = tax_assign_fp
        parameters["-s"] = opts.refseq_fp
        parameters["-t"] = opts.starting_tree_fp

    # call the module and return a tree object
    result = \
        tree_insertion_module_names[module].insert_sequences_into_tree(seqs,
                                                moltype=DNA, params=parameters)

    result_tree = strip_and_rename_unwanted_labels_from_tree(align_map, result)

    # write out the tree returned by the relabelling step
    final_tree = join(output_dir, '%s_final_placement.tre' % (module))
    write_updated_tree_file(final_tree, result_tree)
Example #15
0
def VOR(alignment, n=1000, force_monte_carlo=False, mc_threshold=1000):
    """Returns sequence weights according to the Voronoi weighting method.

    alignment: Alignment object
    n: sampling size (only relevant when monte carlo sampling is used)
    force_monte_carlo: always generate the pseudo seqs with monte carlo,
        even if there is only a small number of possible unique pseudo seqs
    mc_threshold: if the number of possible pseudo seqs exceeds this
        threshold, monte carlo sampling is used.

    VOR differs from VA in the set of sequences against which it compares
    all the sequences in the alignment. In addition to the sequences in the
    alignment itself, it uses a set of pseudo sequences.

    Generating discrete random sequences:
    A discrete random sequence is generated by choosing, with equal
    likelihood at each position, one of the residues observed at that
    position in the alignment. A single occurrence in the alignment column
    is sufficient to make the residue type an option. Note: the choice is
    made with equal likelihood among the observed residues (independent of
    their frequency at that position). In earlier versions of the algorithm
    the characters were chosen either at the frequency with which they occur
    at a position or at the frequency with which they occur in the database.
    Both trials were unsuccessful, because they deviate from random sampling
    (see Sibbald & Argos 1990).

    Depending on the number of possible pseudo sequences, either all of them
    are used or a random sample is taken (monte carlo).

    Example:
    Alignment: AA, AA, BB
        AA      AA      BB
    AA  0 (.5)  0 (.5)  2
    AB  1 (1/3) 1 (1/3) 1 (1/3)
    BA  1 (1/3) 1 (1/3) 1 (1/3)
    BB  2       2       0 (1)
    -----------------------------
    total 7/6     7/6     10/6
    norm  .291    .291    .418

    For a bigger example with more pseudo sequences, see Henikoff 1994

    The described optimization (pre-calculating the distance to the closest
    sequence) was tried; it has no advantage over the original method.
    """
    # choose how the pseudo sequences get generated
    if force_monte_carlo or number_of_pseudo_seqs(alignment) > mc_threshold:
        make_pseudo_seqs = pseudo_seqs_monte_carlo
    else:
        make_pseudo_seqs = pseudo_seqs_exact

    # turn the alignment into arrays, one character array per sequence
    dense_aln = DenseAlignment(alignment, MolType=BYTES)
    seq_rows = [array(str(aligned_seq), 'c') for aligned_seq in dense_aln.Seqs]

    # every pseudo sequence casts votes that accumulate into the weights
    weights = zeros(len(dense_aln.Names), Float64)
    for pseudo_seq in make_pseudo_seqs(dense_aln, n=n):
        pseudo_row = array(pseudo_seq, 'c')
        distances = array(
            [hamming_distance(seq_row, pseudo_row) for seq_row in seq_rows])
        weights += row_to_vote(distances)  # distances -> votes, then add

    named_weights = Weights(dict(zip(dense_aln.Names, weights)))
    named_weights.normalize()
    return named_weights
Example #16
0
def main():
    """Insert query sequences into a starting tree with the chosen tool.

    Parses the command line described by ``script_info``, converts the input
    alignment to phylip-compliant names, assembles the parameters for the
    selected insertion method (``raxml_v730``, ``pplacer`` or
    ``parsinsert``), runs that method's ``insert_sequences_into_tree`` and
    writes the resulting tree to
    ``<output_dir>/<method>_final_placement.tre``.

    Raises:
        IOError: if pplacer is selected and no stats file was supplied.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    parameters = {}

    # get the tree insertion method to use
    module = opts.insertion_method

    # create output directory
    output_dir = opts.output_dir
    create_dir(output_dir)

    # list of tree insertion methods
    tree_insertion_module_names = {
        "raxml_v730": qiime.pycogent_backports.raxml_v730,
        "parsinsert": qiime.pycogent_backports.parsinsert,
        "pplacer": qiime.pycogent_backports.pplacer,
    }

    # load input sequences and convert to phylip since the tools require
    # the query sequences to phylip-compliant names
    # NOTE(review): assumes DenseAlignment reads the whole parser while it
    # is being constructed, so the file can be closed right afterwards.
    with open(opts.input_fasta_fp, "U") as input_fasta_f:
        aln = DenseAlignment(MinimalFastaParser(input_fasta_f))
    seqs, align_map = aln.toPhylip()

    # the parameter file is optional: start from an empty mapping so the
    # per-method lookups below never touch an unbound name
    param_dict = {}
    if opts.method_params_fp:
        with open(opts.method_params_fp, "U") as params_f:
            param_dict = parse_qiime_parameters(params_f)

    if module == "raxml_v730":
        # load the reference sequences
        with open(opts.refseq_fp, "U") as refseq_f:
            load_ref_aln = DenseAlignment(MinimalFastaParser(refseq_f))

        # combine and load the reference plus query
        combined_aln = MinimalFastaParser(StringIO(load_ref_aln.toFasta() + "\n" + aln.toFasta()))
        # overwrite the alignment map
        aln = DenseAlignment(combined_aln)
        seqs, align_map = aln.toPhylip()

        # user-supplied raxml parameters, if any
        try:
            parameters = param_dict["raxml"]
        except KeyError:
            parameters = {}

        tree = convert_tree_tips(align_map, opts.starting_tree_fp)

        # write out the tree with phylip labels
        updated_tree_fp = join(output_dir, "%s_phylip_named_tree.tre" % (module))
        write_updated_tree_file(updated_tree_fp, tree)

        # set the primary parameters for raxml
        parameters["-w"] = abspath(output_dir) + "/"
        parameters["-n"] = split(splitext(get_tmp_filename())[0])[-1]
        parameters["-t"] = updated_tree_fp

        # only fill these in when the user did not choose them
        if "-f" not in parameters:
            parameters["-f"] = "v"
        if "-m" not in parameters:
            parameters["-m"] = "GTRGAMMA"

    elif module == "pplacer":
        # user-supplied pplacer parameters, if any
        try:
            parameters = param_dict["pplacer"]
        except KeyError:
            parameters = {}

        # make sure stats file is passed
        if not opts.stats_fp:
            raise IOError("When using pplacer, the RAxML produced info file is required.")

        # set the primary parameters for pplacer - allow for user-defined
        parameters["--out-dir"] = abspath(output_dir) + "/"
        parameters["-t"] = opts.starting_tree_fp
        parameters["-r"] = opts.refseq_fp
        parameters["-s"] = opts.stats_fp

    elif module == "parsinsert":
        # user-supplied parsinsert parameters, if any
        try:
            parameters = param_dict["parsinsert"]
        except KeyError:
            parameters = {}

        # define log fp
        log_fp = join(output_dir, "parsinsert.log")

        # define tax assignment values fp
        tax_assign_fp = join(output_dir, "parsinsert_assignments.log")
        parameters["-l"] = log_fp
        parameters["-o"] = tax_assign_fp
        parameters["-s"] = opts.refseq_fp
        parameters["-t"] = opts.starting_tree_fp

    # call the module and return a tree object
    result = tree_insertion_module_names[module].insert_sequences_into_tree(seqs, moltype=DNA, params=parameters)

    result_tree = strip_and_rename_unwanted_labels_from_tree(align_map, result)

    # write out the tree returned by the relabelling step
    final_tree = join(output_dir, "%s_final_placement.tre" % (module))
    write_updated_tree_file(final_tree, result_tree)