Beispiel #1
0
def do_analysis_helper(labels, element_lists, w):
    """
    Chop up the rows of data.
    Yield lines of text to be displayed in an html pre tag.
    @param labels: row labels to be left justified
    @param element_lists: data rows where each element is a letter or a span
    @param w: the width; the number of elements allowed per page row
    @raise ValueError: if the rows are empty or do not share one nonzero length
    """
    # The error message promises a nonzero common length, so enforce
    # the nonzero part too instead of silently yielding nothing.
    lengths = set(len(element_list) for element_list in element_lists)
    if len(lengths) != 1 or 0 in lengths:
        msg = 'each element list should have the same nonzero length'
        raise ValueError(msg)
    # Reserve one extra column so labels never touch the data.
    label_width = max(len(label) for label in labels) + 1
    chopped_element_lists = [
        list(iterutils.chopped(element_list, w))
        for element_list in element_lists
    ]
    # Materialize so len() also works if zip returns an iterator (py3).
    page_rows = list(zip(*chopped_element_lists))
    for i, page_row in enumerate(page_rows):
        # Each page starts with a ruler labeling the element positions.
        header = ''
        header += ' ' * label_width
        header += Monospace.get_ruler_line(i * w + 1, i * w + len(page_row[0]))
        yield header
        for label, element_list in zip(labels, page_row):
            justified_label = label.ljust(label_width)
            yield ''.join([justified_label] + list(element_list))
        # Separate pages with a blank line, but not after the last page.
        if i < len(page_rows) - 1:
            yield ''
Beispiel #2
0
def do_analysis_helper(labels, element_lists, w):
    """
    Break the data rows into fixed-width pages.

    Yield lines of text to be displayed in an html pre tag.
    @param labels: row labels to be left justified
    @param element_lists: data rows where each element is a letter or a span
    @param w: the width; the number of elements allowed per page row
    """
    distinct_lengths = set(len(row) for row in element_lists)
    if len(distinct_lengths) != 1:
        raise ValueError('each element list should have the same nonzero length')
    # One extra space keeps the labels clear of the data columns.
    label_width = max(len(x) for x in labels) + 1
    pages = zip(*[list(iterutils.chopped(row, w)) for row in element_lists])
    last_index = len(pages) - 1
    for page_index, page in enumerate(pages):
        offset = page_index * w
        # A ruler line labels the element positions on this page.
        ruler = Monospace.get_ruler_line(offset + 1, offset + len(page[0]))
        yield ' ' * label_width + ruler
        for label, row in zip(labels, page):
            yield label.ljust(label_width) + ''.join(row)
        # Blank line between pages, but none after the final page.
        if page_index != last_index:
            yield ''
Beispiel #3
0
 def _init_codon_to_count(self):
     """Parse the raw human codon count table into self._codon_to_count."""
     # Each data line holds four (codon, per-thousand, count) triples.
     codon_pattern = r'([ACGU][ACGU][ACGU])(.*)\((.*)\)'
     line_pattern = '.*'.join([codon_pattern] * 4)
     token_lists = []
     for raw_line in StringIO(_raw_human_codon_count_string):
         stripped = raw_line.strip()
         if not stripped:
             continue
         match = re.search(line_pattern, stripped)
         tokens = [t.strip() for t in match.groups()]
         assert len(tokens) == 12
         token_lists.append(tokens)
     # 16 lines of 4 codons each covers all 64 codons.
     assert len(token_lists) == 16
     # Populate the dictionary from the parsed tokens.
     for tokens in token_lists:
         for token in tokens:
             assert type(token) == str
         for codon, per_thousand, count in iterutils.chopped(tokens, 3):
             # validate the codon
             assert len(codon) == 3
             assert set(codon) <= set('ACGU')
             # use the dna sense codon instead of the rna codon
             codon = codon.replace('U', 'T')
             # validate the count
             assert set(count) <= set('0123456789')
             count = int(count)
             assert count > 0
             self._codon_to_count[codon] = count
     assert len(self._codon_to_count) == 64
Beispiel #4
0
 def get_fasta_sequence(self, header, ncols=60):
     """
     Render one aligned sequence in fasta format.

     @param header: the header naming the requested sequence
     @param ncols: the maximum number of residues per line
     @return: a string representing the header and sequence in fasta format
     @raise KeyError: if the header is not present in the alignment
     """
     header_to_sequence = dict(zip(self.headers, self.sequences))
     sequence = header_to_sequence[header]
     # NOTE: a stray debug "print sequence" was removed here; it dumped
     # the whole sequence to stdout on every call.
     # Wrap the sequence at ncols residues per line.
     chunks = [sequence[i:i + ncols] for i in range(0, len(sequence), ncols)]
     arr = []
     arr.append('>' + header)
     arr.append('\n'.join(chunks))
     return '\n'.join(arr)
Beispiel #5
0
def get_alignment_string_non_interleaved(alignment):
    """
    @param alignment: a fasta alignment object
    @return: a non interleaved phylip alignment string
    """
    out = StringIO()
    # Write the number of sequences and the common sequence length.
    # out.write replaces the py2-only "print >>" chevron so the function
    # also runs under Python 3; the emitted text is identical.
    out.write('%d %d\n' % (len(alignment.headers), len(alignment.sequences[0])))
    # Write each name followed by its sequence wrapped at 60 residues.
    for header, sequence in zip(alignment.headers, alignment.sequences):
        out.write('%s\n' % header)
        for i in range(0, len(sequence), 60):
            out.write('%s\n' % sequence[i:i + 60])
    return out.getvalue().strip()
Beispiel #6
0
def get_alignment_string_non_interleaved(alignment):
    """
    Build a sequential (non interleaved) phylip string.

    @param alignment: a fasta alignment object
    @return: a non interleaved phylip alignment string
    """
    width = 60
    # First line: sequence count and common sequence length.
    lines = ['%s %s' % (len(alignment.headers), len(alignment.sequences[0]))]
    # Then each taxon name followed by its sequence in 60-residue rows.
    for name, residues in zip(alignment.headers, alignment.sequences):
        lines.append(str(name))
        for start in range(0, len(residues), width):
            lines.append(residues[start:start + width])
    return '\n'.join(lines).strip()
Beispiel #7
0
 def __init__(self, lines):
     """
     Build a codon alignment from lines of a fasta file.

     The nucleotide sequences are upper-cased, validated as an aligned
     set over ACGT and the gap character, grouped into codons of three
     columns, and every codon column containing a fully gapped codon
     ('---') is discarded.

     @param lines: lines of a fasta file
     @raise AlignmentError: if headers or sequences are missing or unaligned
     @raise CodonAlignmentError: if the columns do not form valid codons
     """
     if not lines:
         raise AlignmentError('no fasta lines were provided')
     # Split the input into (header, sequence) pairs.
     header_sequence_pairs = list(gen_header_sequence_pairs(lines))
     self.headers = [header for header, sequence in header_sequence_pairs]
     # Normalize case so validation against ACGT- works uniformly.
     nucleotide_sequences = [
         seq.upper() for header, seq in header_sequence_pairs
     ]
     if not nucleotide_sequences:
         raise AlignmentError('no nucleotide sequences were found')
     for header in self.headers:
         if not header:
             raise AlignmentError('each sequence should have a header')
     # An alignment requires every sequence to have the same length.
     if len(set(len(sequence) for sequence in nucleotide_sequences)) != 1:
         raise AlignmentError('not all sequences are the same length')
     # Only ACGT and the gap character are allowed at each site.
     for seq in nucleotide_sequences:
         invalid_states = set(seq) - set('ACGT-')
         if invalid_states:
             example_invalid_state = invalid_states.pop()
             raise CodonAlignmentError('invalid nucleotide: %s' %
                                       example_invalid_state)
     nucleotide_columns = zip(*nucleotide_sequences)
     if len(nucleotide_columns) % 3 != 0:
         raise CodonAlignmentError(
             'the number of aligned nucleotide columns should be a multiple of three'
         )
     # Group each sequence into codons (triples via iterutils.chopped,
     # presumably consecutive non-overlapping chunks); these may still
     # include the fully gapped codon '---'.
     gappy_codon_sequences = [
         list(iterutils.chopped(seq, 3)) for seq in nucleotide_sequences
     ]
     if not gappy_codon_sequences:
         raise CodonAlignmentError('no codon sequences were found')
     # Every observed codon must be in Codon.g_non_stop_codons or be the
     # gap codon; partially gapped codons such as 'A--' are rejected here.
     observed_gappy_codons = set(
         itertools.chain.from_iterable(gappy_codon_sequences))
     valid_gappy_codons = set(list(Codon.g_non_stop_codons) + ['---'])
     invalid_gappy_codons = observed_gappy_codons - valid_gappy_codons
     if invalid_gappy_codons:
         example_invalid_codon = invalid_gappy_codons.pop()
         raise CodonAlignmentError('invalid codon: %s' %
                                   example_invalid_codon)
     # Keep only the codon columns that contain no gap codon at all.
     self.columns = [
         col for col in zip(*gappy_codon_sequences) if '---' not in col
     ]
     if not self.columns:
         raise CodonAlignmentError('no ungapped codon columns were found')
     # Rebuild the per-taxon codon sequences from the retained columns.
     self.sequences = zip(*self.columns)
Beispiel #8
0
def get_alignment_string_interleaved(alignment):
    """
    @param alignment: a fasta alignment object
    @return: an interleaved phylip alignment string
    """
    width = 60
    # Chop every sequence into bands of at most 60 residues, and
    # materialize the transposed result so that bands[0] and bands[1:]
    # also work when zip returns an iterator (Python 3).
    chopped_sequences = [
        [seq[i:i + width] for i in range(0, len(seq), width)]
        for seq in alignment.sequences
    ]
    bands = list(zip(*chopped_sequences))
    out = StringIO()
    # out.write replaces the py2-only "print >>" chevron; output text
    # is identical under both Python 2 and Python 3.
    out.write('%d %d\n' % (len(alignment.headers), len(alignment.sequences[0])))
    # Pad names to at least 9 characters, presumably the phylip taxon
    # name field width -- TODO confirm against the phylip format spec.
    lengths = [9] + [len(header) for header in alignment.headers]
    n = max(lengths)
    # The first band carries the left-justified taxon names.
    for header, segment in zip(alignment.headers, bands[0]):
        out.write('%s  %s\n' % (header.ljust(n), segment))
    out.write('\n')
    # Subsequent bands are bare segments separated by blank lines.
    for band in bands[1:]:
        for segment in band:
            out.write('%s\n' % segment)
        out.write('\n')
    return out.getvalue().strip()
Beispiel #9
0
def get_alignment_string_interleaved(alignment):
    """
    Build an interleaved phylip string.

    @param alignment: a fasta alignment object
    @return: an interleaved phylip alignment string
    """
    def chop(seq):
        # Split a sequence into segments of at most 60 residues.
        return [seq[k:k + 60] for k in range(0, len(seq), 60)]

    bands = list(zip(*[chop(seq) for seq in alignment.sequences]))
    # Taxon names are padded to at least 9 characters.
    name_width = max([9] + [len(h) for h in alignment.headers])
    lines = ['%s %s' % (len(alignment.headers), len(alignment.sequences[0]))]
    # The first band carries the justified names; later bands are bare
    # segments, each band followed by a blank separator line.
    for name, segment in zip(alignment.headers, bands[0]):
        lines.append('%s  %s' % (name.ljust(name_width), segment))
    lines.append('')
    for band in bands[1:]:
        lines.extend(band)
        lines.append('')
    return '\n'.join(lines).strip()
Beispiel #10
0
 def load(self, lines):
     """
     Parse nexus data into the tree and alignment members.

     Splits the input into TREES and CHARACTERS blocks, reads a single
     newick tree definition from the TREES block, and reads a list of
     (taxon, sequence) pairs from the CHARACTERS block matrix.
     Sets self.tree and self.alignment; whole-line comments are passed
     to self.add_comment.

     @param lines: lines of nexus data
     @raise NexusError: if a required block or element is missing or malformed
     """
     # get the taxa, tree, and character lines
     taxa_lines = []
     tree_lines = []
     character_lines = []
     # current_array points at the list for the block being read,
     # or is None outside of any recognized block.
     current_array = None
     for line in iterutils.stripped_lines(lines):
         # Ignore an entire line that is a comment.
         # Nested comments and multi-line comments
         # are not correctly processed here.
         if line.startswith('[') and line.endswith(']'):
             self.add_comment(line[1:-1])
             continue
         tokens = line.upper().split()
         if tokens == ['BEGIN', 'TAXA;']:
             current_array = taxa_lines
         elif tokens == ['BEGIN', 'TREES;']:
             current_array = tree_lines
         elif tokens == ['BEGIN', 'CHARACTERS;']:
             current_array = character_lines
         elif tokens == ['END;']:
             current_array = None
         elif current_array is not None:
             current_array.append(line)
     # assert that tree lines and character lines are present
     # (the TAXA block is collected but not otherwise used here)
     if not tree_lines:
         raise NexusError('TREES was not found')
     if not character_lines:
         raise NexusError('CHARACTERS was not found')
     # read the newick tree string
     nexus_tree_string = ''.join(tree_lines)
     if nexus_tree_string.count(';') != 1:
         raise NexusError(
             'expected exactly one semicolon in the nexus TREES block')
     if nexus_tree_string.count('=') != 1:
         raise NexusError(
             'expected exactly one equals sign in the nexus TREES block')
     # everything after the equals sign is the newick string
     offset = nexus_tree_string.find('=')
     newick_string = nexus_tree_string[offset + 1:]
     self.tree = Newick.parse(newick_string, Newick.NewickTree)
     # read the alignment matrix, skipping the DIMENSIONS and FORMAT
     # lines and starting after the MATRIX line
     arr = []
     found_matrix = False
     for line in character_lines:
         if line.upper().startswith('DIMENSIONS'):
             continue
         if line.upper().startswith('FORMAT'):
             continue
         if line.upper().startswith('MATRIX'):
             found_matrix = True
             continue
         if found_matrix:
             # the semicolon terminates the matrix; treat it as whitespace
             arr.append(line.replace(';', ' '))
     if not arr:
         raise NexusError('no alignment was found')
     tokens = ' '.join(arr).split()
     if len(tokens) % 2 != 0:
         raise NexusError(
             'expected the alignment to be a list of (taxon, sequence) pairs'
         )
     # rewrite the pairs as fasta text and parse them into an alignment
     alignment_out = StringIO()
     for header, sequence in iterutils.chopped(tokens, 2):
         sequence = sequence.upper()
         unexpected_letters = set(sequence) - set('ACGT')
         if unexpected_letters:
             raise NexusError('unexpected sequence character(s): %s' %
                              list(unexpected_letters))
         print >> alignment_out, '>%s' % header
         print >> alignment_out, sequence
     alignment_string = alignment_out.getvalue()
     self.alignment = Fasta.Alignment(StringIO(alignment_string))
Beispiel #11
0
 def load(self, lines):
     """
     Read nexus data and populate the tree and alignment members.

     @param lines: lines of nexus data
     """
     # Route each stripped line into the list for its enclosing block.
     taxa_lines = []
     tree_lines = []
     character_lines = []
     block_map = {
         ('BEGIN', 'TAXA;'): taxa_lines,
         ('BEGIN', 'TREES;'): tree_lines,
         ('BEGIN', 'CHARACTERS;'): character_lines,
     }
     target = None
     for line in iterutils.stripped_lines(lines):
         # A line that is entirely a comment; nested and multi-line
         # comments are not handled correctly here.
         if line.startswith('[') and line.endswith(']'):
             self.add_comment(line[1:-1])
             continue
         key = tuple(line.upper().split())
         if key in block_map:
             target = block_map[key]
         elif key == ('END;',):
             target = None
         elif target is not None:
             target.append(line)
     # The TREES and CHARACTERS blocks are mandatory.
     if not tree_lines:
         raise NexusError('TREES was not found')
     if not character_lines:
         raise NexusError('CHARACTERS was not found')
     # Extract the newick string from the single tree definition.
     nexus_tree_string = ''.join(tree_lines)
     if nexus_tree_string.count(';') != 1:
         raise NexusError('expected exactly one semicolon in the nexus TREES block')
     if nexus_tree_string.count('=') != 1:
         raise NexusError('expected exactly one equals sign in the nexus TREES block')
     newick_string = nexus_tree_string[nexus_tree_string.find('=') + 1:]
     self.tree = Newick.parse(newick_string, Newick.NewickTree)
     # Collect the matrix rows from the CHARACTERS block.
     matrix_rows = []
     in_matrix = False
     for line in character_lines:
         upper = line.upper()
         if upper.startswith('DIMENSIONS') or upper.startswith('FORMAT'):
             continue
         if upper.startswith('MATRIX'):
             in_matrix = True
             continue
         if in_matrix:
             # The terminating semicolon is treated as whitespace.
             matrix_rows.append(line.replace(';', ' '))
     if not matrix_rows:
         raise NexusError('no alignment was found')
     tokens = ' '.join(matrix_rows).split()
     if len(tokens) % 2 != 0:
         raise NexusError('expected the alignment to be a list of (taxon, sequence) pairs')
     # Re-emit the pairs as fasta text and parse them into an alignment.
     alignment_out = StringIO()
     for header, sequence in iterutils.chopped(tokens, 2):
         sequence = sequence.upper()
         unexpected_letters = set(sequence) - set('ACGT')
         if unexpected_letters:
             raise NexusError('unexpected sequence character(s): %s' % list(unexpected_letters))
         alignment_out.write('>%s\n' % header)
         alignment_out.write('%s\n' % sequence)
     self.alignment = Fasta.Alignment(StringIO(alignment_out.getvalue()))