Python Fasta.Alignment Examples, Fasta.Alignment, barque Python Examples

Example #1

0

Show file

File: 20080123b.py Project: BIGtigr/xgcode

def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # define the jukes cantor rate matrix
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
        dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(row_major_rate_matrix,
                                               ordered_states)
    # simulate the ancestral alignment
    try:
        alignment = PhyLikelihood.simulate_ancestral_alignment(
            tree, alignment, rate_matrix_object)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # get the alignment string using an ordering defined by the tree
    arr = []
    for node in tree.preorder():
        arr.append(alignment.get_fasta_sequence(node.name))
    # return the response
    return '\n'.join(arr) + '\n'

Example #2

0

Show file

File: 20080819a.py Project: BIGtigr/xgcode

def get_response_content(fs):
    # read the nucleotide weights
    nt_weights = [fs.A, fs.C, fs.G, fs.T]
    # convert the nucleotide weights to probabilities
    nt_probs = [x / float(sum(nt_weights)) for x in nt_weights]
    # Assert that the kappa value and the nucleotide
    # probabilities are compatible.
    A, C, G, T = nt_probs
    R = float(A + G)
    Y = float(C + T)
    if R <= 0:
        raise HandlingError('the frequency of a purine must be positive')
    if Y <= 0:
        raise HandlingError('the frequency of a pyrimidine must be positive')
    if fs.kappa <= max(-Y, -R):
        msg_a = 'kappa must be greater than max(-R, -Y) '
        msg_b = 'where R and Y are the purine and pyrimidine frequencies'
        raise HandlingError(msg_a + msg_b)
    # Create the rate matrix object
    # which is automatically scaled to a rate of 1.0.
    model = F84.create_rate_matrix(fs.kappa, nt_probs)
    # simulate a pair of sequences
    sequence_pair = PairLikelihood.simulate_sequence_pair(
        fs.distance, model, fs.length)
    # convert the pair of sequences to an alignment object
    aln = StringIO()
    print >> aln, '>first'
    print >> aln, ''.join(sequence_pair[0])
    print >> aln, '>second'
    print >> aln, ''.join(sequence_pair[1])
    return Fasta.Alignment(StringIO(aln.getvalue())).to_fasta_string() + '\n'

Example #3

0

Show file

File: PhyLikelihood.py Project: BIGtigr/xgcode

def simulate_ancestral_alignment(tree, alignment, substitution_model):
    """
    @param tree: a newick tree with branch lengths
    @param alignment: a Fasta Alignment object with headers that match the tree tip names
    @param substitution_model: a way to simulate ancestral states from a tree given its leaf states
    @return: a Fasta Alignment object of the simulated ancestral sequences
    """
    for node in tree.gen_non_root_nodes():
        if node.get_branch_length() is None or node.get_branch_length() <= 0:
            raise SimulationError('all branch lengths should be positive')
    for node in tree.gen_internal_nodes():
        if not node.name:
            raise SimulationError('all internal nodes should be named')
    simulated_ancestors = dict((node.name, []) for node in tree.gen_internal_nodes())
    for col in alignment.columns:
        name_to_letter = dict(zip(alignment.headers, col))
        # Augment each tip with its corresponding letter.
        for tip in tree.gen_tips():
            tip.state = name_to_letter[tip.name]
        # Do the simulation.
        substitution_model.simulate_ancestral_states(tree)
        name_state_pairs = [(node.name, node.state) for node in tree.gen_internal_nodes_preorder()]
        # Add this simulated column.
        for name, state in name_state_pairs:
            simulated_ancestors[name].append(state)
    # Create an alignment object from the simulated sequences.
    sio = StringIO()
    print >> sio, alignment.to_fasta_string()
    for header, sequence in simulated_ancestors.items():
        print >> sio, '>' + header
        print >> sio, ''.join(sequence)
    fasta_string = sio.getvalue()
    return Fasta.Alignment(StringIO(fasta_string))

Example #4

0

Show file

def main():
    # create the alignment object
    print 'creating the alignment...'
    alignment_string = Fasta.brown_example_alignment.strip()
    alignment = Fasta.Alignment(StringIO(alignment_string))
    # create a tree object
    print 'creating the tree...'
    tree_string = Newick.brown_example_tree
    tree = Newick.parse(tree_string, Newick.NewickTree)
    # create a rate matrix object
    print 'creating the rate matrix object...'
    distribution = {'A': .25, 'C': .25, 'G': .25, 'T': .25}
    kappa = 2.0
    row_major_rate_matrix = RateMatrix.get_unscaled_hky85_rate_matrix(
        distribution, kappa).get_row_major_rate_matrix()
    rate_matrix = RateMatrix.FastRateMatrix(row_major_rate_matrix,
                                            list('ACGT'))
    rate_matrix.normalize()
    # get the mle_rates
    print 'getting the mle rates...'
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    print 'mle rates:'
    print mle_rates
    print 'stockholm string:'
    print get_stockholm_string(tree, alignment, mle_rates)

Example #5

0

Show file

def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the mixture weights
    weights = [fs.weight_a, fs.weight_b, fs.weight_c]
    # get the matrices
    matrices = [fs.matrix_a, fs.matrix_b, fs.matrix_c]
    for R in matrices:
        if R.shape != (4, 4):
            msg = 'expected each nucleotide rate matrix to be 4x4'
            raise HandlingError(msg)
    # get the nucleotide alignment
    try:
        alignment = Fasta.Alignment(fs.alignment.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # create the mixture proportions
    weight_sum = sum(weights)
    mixture_proportions = [weight / weight_sum for weight in weights]
    # create the rate matrix objects
    ordered_states = list('ACGT')
    rate_matrix_objects = []
    for R in matrices:
        rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
        rate_matrix_objects.append(rate_matrix_object)
    # create the mixture model
    mixture_model = SubModel.MixtureModel(mixture_proportions,
                                          rate_matrix_objects)
    # normalize the mixture model
    mixture_model.normalize()
    # return the html string
    return do_analysis(mixture_model, alignment, tree) + '\n'

Example #6

0

Show file

def get_response_content(fs):
    # read the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() != 2:
        raise HandlingError('expected a sequence pair')
    # read the rate matrix
    R = fs.matrix
    # read the ordered states
    ordered_states = Util.get_stripped_lines(fs.states.splitlines())
    if len(ordered_states) != len(R):
        msg_a = 'the number of ordered states must be the same '
        msg_b = 'as the number of rows in the rate matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    # create the rate matrix object using the ordered states
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states) 
    # create the objective function
    objective = Objective(alignment.sequences, rate_matrix_object)
    # Use golden section search to find the mle distance.
    # The bracket is just a suggestion.
    bracket = (0.51, 2.01)
    mle_distance = optimize.golden(objective, brack=bracket)
    # write the response
    out = StringIO()
    print >> out, 'maximum likelihood distance:', mle_distance
    #distances = (mle_distance, 0.2, 2.0, 20.0)
    #for distance in distances:
        #print >> out, 'f(%s): %s' % (distance, objective(distance))
    return out.getvalue()

Example #7

0

Show file

File: PhyLikelihood.py Project: BIGtigr/xgcode

 def test_simulation(self):
     tree_string = '(((Human:0.1, Chimpanzee:0.2)to-chimp:0.8, Gorilla:0.3)to-gorilla:0.7, Orangutan:0.4, Gibbon:0.5)all;'
     # Parse the example tree.
     tree = Newick.parse(tree_string, Newick.NewickTree)
     tree.assert_valid()
     # Get header and sequence pairs.
     alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
     # Get the Jukes-Cantor rate matrix object.
     dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
     ordered_states = list('ACGT')
     row_major_rate_matrix = MatrixUtil.dict_to_row_major(dictionary_rate_matrix, ordered_states, ordered_states)
     rate_matrix_object = RateMatrix.RateMatrix(row_major_rate_matrix, ordered_states)
     # Simulate ancestral states.
     simulated_alignment = simulate_ancestral_alignment(tree, alignment, rate_matrix_object)

Example #8

0

Show file

File: PhyLikelihood.py Project: BIGtigr/xgcode

 def test_likelihood(self):
     # Parse the example tree.
     tree_string = Newick.brown_example_tree
     tree = Newick.parse(tree_string, Newick.NewickTree)
     tree.assert_valid()
     # Get header and sequence pairs.
     alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
     # Get the Jukes-Cantor rate matrix object.
     dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
     ordered_states = list('ACGT')
     row_major_rate_matrix = MatrixUtil.dict_to_row_major(dictionary_rate_matrix, ordered_states, ordered_states)
     rate_matrix_object = RateMatrix.RateMatrix(row_major_rate_matrix, ordered_states)
     # Calculate the log likelihood.
     log_likelihood = get_log_likelihood(tree, alignment, rate_matrix_object)
     self.assertAlmostEqual(log_likelihood, -4146.26547208)

Example #9

0

Show file

def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the nucleotide alignment
    try:
        alignment = Fasta.Alignment(fs.alignment.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # get the normalized Direct RNA mixture model
    mixture_model = DirectRna.deserialize_mixture_model(fs.model)
    mixture_model.normalize()
    # return the html string
    return do_analysis(mixture_model, alignment, tree) + '\n'

Example #10

0

Show file

def get_response_content(fs):
    # read the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # Create the distance matrix,
    # replacing values of None with the representation for infinity.
    row_major_distance_matrix = []
    for row in JC69.get_ML_distance_matrix(alignment.sequences):
        corrected_row = [fs.infinity if x == float('inf') else x for x in row]
        row_major_distance_matrix.append(corrected_row)
    # return the response
    return MatrixUtil.m_to_string(row_major_distance_matrix) + '\n'

Example #11

0

Show file

def get_response_content(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    # read the alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # read the rate matrix
    R = fs.matrix
    # read the ordered states
    ordered_states = Util.get_stripped_lines(StringIO(fs.states))
    if len(ordered_states) != len(R):
        msg_a = 'the number of ordered states must be the same '
        msg_b = 'as the number of rows in the rate matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    # create the rate matrix object using the ordered states
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
    # create the distance matrix
    n = alignment.get_sequence_count()
    row_major_distance_matrix = [[0] * n for i in range(n)]
    for i, sequence_a in enumerate(alignment.sequences):
        for j, sequence_b in enumerate(alignment.sequences):
            if i < j:
                # create the objective function using the sequence pair
                objective = Objective((sequence_a, sequence_b),
                                      rate_matrix_object)
                # Use golden section search to find the mle distance.
                # The bracket is just a suggestion.
                bracket = (0.51, 2.01)
                mle_distance = optimize.golden(objective, brack=bracket)
                # fill two elements of the matrix
                row_major_distance_matrix[i][j] = mle_distance
                row_major_distance_matrix[j][i] = mle_distance
    # write the response
    out = StringIO()
    print >> out, 'maximum likelihood distance matrix:'
    print >> out, MatrixUtil.m_to_string(row_major_distance_matrix)
    return out.getvalue()

Example #12

0

Show file

File: 20080821a.py Project: BIGtigr/xgcode

def get_response_content(fs):
    # get the alignment object
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # assert that the alignment is of exactly two sequences
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # assert that the alignment is a gapless unambiguous nucleotide alignment
    old_column_count = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    new_column_count = alignment.get_column_count()
    if old_column_count != new_column_count:
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # get the maximum likelihood estimates according to a numeric optimizer.
    f = F84.Objective(alignment.sequences)
    values = list(f.get_initial_parameters())
    result = scipy.optimize.fmin(f, values, ftol=1e-10, disp=0)
    distance, kappa, wC, wG, wT = result
    nt_distribution = F84.parameters_to_distribution((wC, wG, wT))
    A, C, G, T = nt_distribution
    model = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(distance,
                                                       alignment.sequences,
                                                       model)
    # begin the response
    out = StringIO()
    print >> out, 'ML distance:', distance
    print >> out, 'ML kappa:', kappa
    print >> out, 'ML A frequency:', A
    print >> out, 'ML C frequency:', C
    print >> out, 'ML G frequency:', G
    print >> out, 'ML T frequency:', T
    print >> out, 'log likelihood:', log_likelihood
    # write the response
    return out.getvalue()

Example #13

0

Show file

File: 20080122a.py Project: BIGtigr/xgcode

def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # get the log likelihood
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
            dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(
            row_major_rate_matrix, ordered_states)
    log_likelihood = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # return the response
    return str(log_likelihood) + '\n'

Example #14

0

Show file

File: PhyLikelihood.py Project: BIGtigr/xgcode

def simulate_alignment(tree, substitution_model, ncolumns, seed=None):
    """
    @param tree: a newick tree with branch lengths
    @param substitution_model: a way to simulate states on a tree
    @param ncolumns: the number of columns to simulate
    @param seed: a random number seed
    @return: a Fasta Alignment object of the simulated sequences
    """
    # Check the input.
    for node in tree.gen_non_root_nodes():
        if node.get_branch_length() is None or node.get_branch_length() <= 0:
            raise SimulationError('all branch lengths should be positive')
    tip_names = [node.name for node in tree.gen_tips()]
    for name in tip_names:
        if not name:
            raise SimulationError('each leaf should have a name')
    if len(tip_names) != len(set(tip_names)):
        raise SimulationError('each leaf should have a unique name')
    # Save the rng state if we are using a seed.
    if seed is not None:
        old_rng_state = random.getstate()
    # Seed the rng if we are using a seed.
    if seed is not None:
        random.seed(seed)
    # Simulate the states on the tree.
    simulated_sequences = dict((node.name, []) for node in tree.gen_tips())
    for column_index in range(ncolumns):
        substitution_model.simulate_states(tree)
        for node in tree.gen_tips():
            simulated_sequences[node.name].append(node.state)
    # Restore the rng state if we are using a seed
    if seed is not None:
        random.setstate(old_rng_state)
    # Create an alignment object from the simulated sequences.
    sio = StringIO()
    for header, sequence in simulated_sequences.items():
        print >> sio, '>' + header
        print >> sio, ''.join(sequence)
    fasta_string = sio.getvalue()
    return Fasta.Alignment(StringIO(fasta_string))

Example #15

0

Show file

def get_response_content(fs):
    # get the alignment object
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # assert that the alignment is of exactly two sequences
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # assert that the alignment is a gapless unambiguous nucleotide alignment
    old_column_count = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    new_column_count = alignment.get_column_count()
    if old_column_count != new_column_count:
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # get the maximum likelihood estimates
    sequence_pair = alignment.sequences
    distance, kappa, A, C, G, T = F84.get_closed_form_estimates(sequence_pair)
    # get the log likelihood
    nt_distribution = (A, C, G, T)
    rate_matrix_object = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(distance,
                                                       alignment.sequences,
                                                       rate_matrix_object)
    # begin the response
    out = StringIO()
    print >> out, 'distance:', distance
    print >> out, 'kappa:', kappa
    print >> out, 'A frequency:', A
    print >> out, 'C frequency:', C
    print >> out, 'G frequency:', G
    print >> out, 'T frequency:', T
    print >> out, 'log likelihood:', log_likelihood
    # return the response
    return out.getvalue()

Example #16

0

Show file

def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the nucleotide distribution
    distribution = SnippetUtil.get_distribution(fs.weights, 'nucleotide',
                                                list('ACGT'))
    # get the nucleotide alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.alignment))
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # get the rate matrix defined by the nucleotide distribution and kappa
    row_major_rate_matrix = RateMatrix.get_unscaled_hky85_rate_matrix(
        distribution, fs.kappa).get_row_major_rate_matrix()
    rate_matrix = RateMatrix.FastRateMatrix(row_major_rate_matrix,
                                            list('ACGT'))
    rate_matrix.normalize()
    # get the mle rates
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    # return the response
    return get_stockholm_string(tree, alignment, mle_rates) + '\n'

Example #17

0

Show file

File: 20080119b.py Project: BIGtigr/xgcode

def get_response_content(fs):
    out = StringIO()
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        print >> out, 'This is a valid alignment.'
    except Fasta.AlignmentError as e:
        alignment = None
        print >> out, 'This is not a valid alignment:', e
    if alignment:
        try:
            old_column_count = len(alignment.columns)
            alignment.force_nucleotide()
            removed_column_count = old_column_count - len(alignment.columns)
            if removed_column_count:
                print >> out, ('After removing %d' % removed_column_count),
                print >> out, 'columns this is a valid nucleotide alignment.'
            else:
                print >> out, 'This is a valid nucleotide alignment.'
        except Fasta.AlignmentError as e:
            print >> out, 'This is not a valid nucleotide alignment:', e
    for header, seq in Fasta.gen_header_sequence_pairs(StringIO(fs.fasta)):
        print >> out, '%s: %d' % (header, len(seq))
    return out.getvalue()

Example #18

0

Show file

def get_response_content(fs):
    # get the alignment object
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # assert that the alignment is of exactly two sequences
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # assert that the alignment is a gapless unambiguous nucleotide alignment
    old_column_count = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    new_column_count = alignment.get_column_count()
    if old_column_count != new_column_count:
        msg = 'expected a gap-free unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # get the maximum likelihood estimate
    sequence_pair = alignment.sequences
    mle = JC69.get_ML_distance(*sequence_pair)
    # return the response
    return 'ML distance estimate: %f\n' % mle

Example #19

0

Show file

File: 20080826c.py Project: BIGtigr/xgcode

def get_response_content(fs):
    # get the alignment object
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # assert that the alignment is of exactly two sequences
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # assert that the alignment is a gapless unambiguous nucleotide alignment
    old_column_count = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    new_column_count = alignment.get_column_count()
    if old_column_count != new_column_count:
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # get the maximum likelihood estimate
    sequence_pair = alignment.sequences
    count = sum(a != b for a, b in zip(*alignment.sequences))
    # return the response
    return 'difference count: %d\n' % count

Example #20

0

Show file

 def load(self, lines):
     """
     @param lines: lines of nexus data
     """
     # get the taxa, tree, and character lines
     taxa_lines = []
     tree_lines = []
     character_lines = []
     current_array = None
     for line in iterutils.stripped_lines(lines):
         # Ignore an entire line that is a comment.
         # Nested comments and multi-line comments
         # are not correctly processed here.
         if line.startswith('[') and line.endswith(']'):
             self.add_comment(line[1:-1])
             continue
         tokens = line.upper().split()
         if tokens == ['BEGIN', 'TAXA;']:
             current_array = taxa_lines
         elif tokens == ['BEGIN', 'TREES;']:
             current_array = tree_lines
         elif tokens == ['BEGIN', 'CHARACTERS;']:
             current_array = character_lines
         elif tokens == ['END;']:
             current_array = None
         elif current_array is not None:
             current_array.append(line)
     # assert that tree lines and character lines are present
     if not tree_lines:
         raise NexusError('TREES was not found')
     if not character_lines:
         raise NexusError('CHARACTERS was not found')
     # read the newick tree string
     nexus_tree_string = ''.join(tree_lines)
     if nexus_tree_string.count(';') != 1:
         raise NexusError(
             'expected exactly one semicolon in the nexus TREES block')
     if nexus_tree_string.count('=') != 1:
         raise NexusError(
             'expected exactly one equals sign in the nexus TREES block')
     offset = nexus_tree_string.find('=')
     newick_string = nexus_tree_string[offset + 1:]
     self.tree = Newick.parse(newick_string, Newick.NewickTree)
     # read the alignment matrix
     arr = []
     found_matrix = False
     for line in character_lines:
         if line.upper().startswith('DIMENSIONS'):
             continue
         if line.upper().startswith('FORMAT'):
             continue
         if line.upper().startswith('MATRIX'):
             found_matrix = True
             continue
         if found_matrix:
             arr.append(line.replace(';', ' '))
     if not arr:
         raise NexusError('no alignment was found')
     tokens = ' '.join(arr).split()
     if len(tokens) % 2 != 0:
         raise NexusError(
             'expected the alignment to be a list of (taxon, sequence) pairs'
         )
     alignment_out = StringIO()
     for header, sequence in iterutils.chopped(tokens, 2):
         sequence = sequence.upper()
         unexpected_letters = set(sequence) - set('ACGT')
         if unexpected_letters:
             raise NexusError('unexpected sequence character(s): %s' %
                              list(unexpected_letters))
         print >> alignment_out, '>%s' % header
         print >> alignment_out, sequence
     alignment_string = alignment_out.getvalue()
     self.alignment = Fasta.Alignment(StringIO(alignment_string))

Example #21

0

Show file

File: Phylip.py Project: BIGtigr/xgcode

def make_sample_alignment():
    """
    Make a sample alignment object.
    """
    return Fasta.Alignment(Fasta.example_fasta_aligned.splitlines())