def do_analysis_helper(labels, element_lists, w):
    """
    Chop up the rows of data.
    Yield lines of text to be displayed in an html pre tag.
    @param labels: row labels to be left justified
    @param element_lists: data rows where each element is a letter or a span
    @param w: the width; the number of elements allowed per page row
    """
    lengths = set(len(row) for row in element_lists)
    if len(lengths) != 1:
        raise ValueError('each element list should have the same nonzero length')
    # leave one space between the widest label and its data
    label_width = max(len(label) for label in labels) + 1
    # chop every data row into width-w pieces and regroup the pieces by page
    pages = zip(*[list(iterutils.chopped(row, w)) for row in element_lists])
    last_page_index = len(pages) - 1
    for page_index, page in enumerate(pages):
        # each page begins with a ruler over the 1-based element positions
        offset = page_index * w
        ruler = Monospace.get_ruler_line(offset + 1, offset + len(page[0]))
        yield ' ' * label_width + ruler
        for label, segment in zip(labels, page):
            yield label.ljust(label_width) + ''.join(segment)
        # separate consecutive pages with a blank line
        if page_index != last_page_index:
            yield ''
def do_analysis_helper(labels, element_lists, w):
    """
    Chop up the rows of data.
    Yield lines of text to be displayed in an html pre tag.
    @param labels: row labels to be left justified
    @param element_lists: data rows where each element is a letter or a span
    @param w: the width; the number of elements allowed per page row
    """
    if len(set(map(len, element_lists))) != 1:
        raise ValueError('each element list should have the same nonzero length')
    # one space separates the longest label from its row of elements
    label_width = max(map(len, labels)) + 1
    padded_labels = [label.ljust(label_width) for label in labels]
    # break each data row into page-sized chunks, then regroup chunk-wise
    chopped = [list(iterutils.chopped(elements, w)) for elements in element_lists]
    pages = zip(*chopped)
    npages = len(pages)
    for index, page in enumerate(pages):
        # a ruler line giving 1-based element positions heads each page
        start = index * w + 1
        stop = index * w + len(page[0])
        yield ' ' * label_width + Monospace.get_ruler_line(start, stop)
        for padded_label, elements in zip(padded_labels, page):
            yield padded_label + ''.join(elements)
        # a blank separator follows every page except the last
        if index + 1 < npages:
            yield ''
def _init_codon_to_count(self):
    """
    Populate self._codon_to_count from the raw human codon usage table.
    Each table line holds four (codon, per-thousand, count) triples; RNA
    codons are converted to DNA sense codons before being used as keys.
    Parsing errors surface as assertion failures because the raw string
    is a hard-coded constant, not untrusted input.
    """
    # parse the raw string
    codon_pattern = r'([ACGU][ACGU][ACGU])(.*)\((.*)\)'
    # four codon entries appear per physical line of the table
    line_pattern = '.*'.join([codon_pattern] * 4)
    token_lists = []
    for line in StringIO(_raw_human_codon_count_string):
        line = line.strip()
        if line:
            m = re.search(line_pattern, line)
            token_list = [x.strip() for x in m.groups()]
            assert len(token_list) == 12
            token_lists.append(token_list)
    # 16 lines of 4 codons each cover all 64 codons
    assert len(token_lists) == 16
    # write the dictionary using the tokens
    for token_list in token_lists:
        for token in token_list:
            # isinstance is the idiomatic type check (was: type(token) == str)
            assert isinstance(token, str)
        for codon, per_thousand, count in iterutils.chopped(token_list, 3):
            # validate the codon
            assert len(codon) == 3
            assert set(codon) <= set('ACGU')
            # use the dna sense codon instead of the rna codon
            codon = codon.replace('U', 'T')
            # validate the count; per_thousand is parsed but intentionally unused
            assert set(count) <= set('0123456789')
            count = int(count)
            assert count > 0
            self._codon_to_count[codon] = count
    assert len(self._codon_to_count) == 64
def get_fasta_sequence(self, header, ncols=60):
    """
    @param header: the header of a sequence in this alignment
    @param ncols: the maximum number of residues per line
    @return: a string representing the header and sequence in fasta format
    @raise KeyError: if the header is not in the alignment
    """
    header_to_sequence = dict(zip(self.headers, self.sequences))
    sequence = header_to_sequence[header]
    arr = []
    arr.append('>' + header)
    # NOTE: a stray debug statement ("print sequence") that dumped the
    # whole sequence to stdout on every call has been removed here.
    arr.append('\n'.join(iterutils.chopped(sequence, ncols)))
    return '\n'.join(arr)
def get_alignment_string_non_interleaved(alignment):
    """
    @param alignment: a fasta alignment object
    @return: a non interleaved phylip alignment string
    """
    # the phylip header gives the sequence count and alignment length
    lines = ['%d %d' % (len(alignment.headers), len(alignment.sequences[0]))]
    # each taxon name is followed by its sequence in 60-residue segments
    for header, sequence in zip(alignment.headers, alignment.sequences):
        lines.append(header)
        lines.extend(iterutils.chopped(sequence, 60))
    return '\n'.join(lines).strip()
def __init__(self, lines):
    """
    Build a codon alignment from fasta lines.
    Validates headers, sequence lengths, nucleotide states, and codons,
    then keeps only the codon columns with no gap codon in any sequence.
    @param lines: lines of a fasta file
    @raise AlignmentError: for structural fasta problems
    @raise CodonAlignmentError: for codon-level problems
    """
    if not lines:
        raise AlignmentError('no fasta lines were provided')
    header_sequence_pairs = list(gen_header_sequence_pairs(lines))
    self.headers = [header for header, sequence in header_sequence_pairs]
    # normalize to upper case before validating nucleotide states
    nucleotide_sequences = [seq.upper() for header, seq in header_sequence_pairs]
    if not nucleotide_sequences:
        raise AlignmentError('no nucleotide sequences were found')
    for header in self.headers:
        if not header:
            raise AlignmentError('each sequence should have a header')
    # all sequences must share a single common length
    if len(set(len(sequence) for sequence in nucleotide_sequences)) != 1:
        raise AlignmentError('not all sequences are the same length')
    # only ACGT and the gap character are allowed states
    for seq in nucleotide_sequences:
        invalid_states = set(seq) - set('ACGT-')
        if invalid_states:
            example_invalid_state = invalid_states.pop()
            raise CodonAlignmentError('invalid nucleotide: %s' % example_invalid_state)
    # the alignment width must divide into whole codons
    nucleotide_columns = zip(*nucleotide_sequences)
    if len(nucleotide_columns) % 3 != 0:
        raise CodonAlignmentError('the number of aligned nucleotide columns should be a multiple of three')
    # split each sequence into triplets; gap codons ('---') may be present
    gappy_codon_sequences = [list(iterutils.chopped(seq, 3)) for seq in nucleotide_sequences]
    if not gappy_codon_sequences:
        raise CodonAlignmentError('no codon sequences were found')
    # every observed codon must be a non-stop codon or the gap codon
    observed_gappy_codons = set(itertools.chain.from_iterable(gappy_codon_sequences))
    valid_gappy_codons = set(list(Codon.g_non_stop_codons) + ['---'])
    invalid_gappy_codons = observed_gappy_codons - valid_gappy_codons
    if invalid_gappy_codons:
        example_invalid_codon = invalid_gappy_codons.pop()
        raise CodonAlignmentError('invalid codon: %s' % example_invalid_codon)
    # drop any codon column that contains a gap codon in any sequence
    self.columns = [col for col in zip(*gappy_codon_sequences) if '---' not in col]
    if not self.columns:
        raise CodonAlignmentError('no ungapped codon columns were found')
    # rebuild per-taxon codon sequences from the surviving columns
    self.sequences = zip(*self.columns)
def get_alignment_string_interleaved(alignment):
    """
    @param alignment: a fasta alignment object
    @return: an interleaved phylip alignment string
    """
    # break each sequence into 60-residue segments and regroup by band
    bands = zip(*[iterutils.chopped(seq, 60) for seq in alignment.sequences])
    # taxon names are padded to a common width of at least nine characters
    pad = max([9] + [len(header) for header in alignment.headers])
    out = StringIO()
    out.write('%d %d\n' % (len(alignment.headers), len(alignment.sequences[0])))
    # the first band carries the padded taxon names
    for header, segment in zip(alignment.headers, bands[0]):
        out.write('%s %s\n' % (header.ljust(pad), segment))
    out.write('\n')
    # subsequent bands are bare segments, each band followed by a blank line
    for band in bands[1:]:
        for segment in band:
            out.write(segment + '\n')
        out.write('\n')
    return out.getvalue().strip()
def get_alignment_string_interleaved(alignment):
    """
    @param alignment: a fasta alignment object
    @return: an interleaved phylip alignment string
    """
    # each band groups one 60-residue segment from every sequence
    segment_rows = [iterutils.chopped(seq, 60) for seq in alignment.sequences]
    bands = zip(*segment_rows)
    # taxon names are left justified to at least nine characters
    name_width = max([9] + [len(header) for header in alignment.headers])
    lines = ['%d %d' % (len(alignment.headers), len(alignment.sequences[0]))]
    # only the first band shows the taxon names
    for header, segment in zip(alignment.headers, bands[0]):
        lines.append('%s %s' % (header.ljust(name_width), segment))
    lines.append('')
    # the remaining bands are bare segments separated by blank lines
    for band in bands[1:]:
        lines.extend(band)
        lines.append('')
    return '\n'.join(lines).strip()
def load(self, lines):
    """
    Parse nexus data into a newick tree and a nucleotide alignment.
    Sets self.tree and self.alignment as side effects.
    @param lines: lines of nexus data
    @raise NexusError: if a required block is missing or malformed
    """
    # get the taxa, tree, and character lines
    taxa_lines = []
    tree_lines = []
    character_lines = []
    current_array = None
    for line in iterutils.stripped_lines(lines):
        # Ignore an entire line that is a comment.
        # Nested comments and multi-line comments
        # are not correctly processed here.
        if line.startswith('[') and line.endswith(']'):
            self.add_comment(line[1:-1])
            continue
        # block delimiters route subsequent lines into the matching list
        tokens = line.upper().split()
        if tokens == ['BEGIN', 'TAXA;']:
            current_array = taxa_lines
        elif tokens == ['BEGIN', 'TREES;']:
            current_array = tree_lines
        elif tokens == ['BEGIN', 'CHARACTERS;']:
            current_array = character_lines
        elif tokens == ['END;']:
            current_array = None
        elif current_array is not None:
            current_array.append(line)
    # assert that tree lines and character lines are present
    if not tree_lines:
        raise NexusError('TREES was not found')
    if not character_lines:
        raise NexusError('CHARACTERS was not found')
    # read the newick tree string
    nexus_tree_string = ''.join(tree_lines)
    if nexus_tree_string.count(';') != 1:
        raise NexusError('expected exactly one semicolon in the nexus TREES block')
    if nexus_tree_string.count('=') != 1:
        raise NexusError('expected exactly one equals sign in the nexus TREES block')
    # the newick string is everything after the equals sign
    offset = nexus_tree_string.find('=')
    newick_string = nexus_tree_string[offset + 1:]
    self.tree = Newick.parse(newick_string, Newick.NewickTree)
    # read the alignment matrix
    arr = []
    found_matrix = False
    for line in character_lines:
        # skip metadata lines; collect only the lines after MATRIX
        if line.upper().startswith('DIMENSIONS'):
            continue
        if line.upper().startswith('FORMAT'):
            continue
        if line.upper().startswith('MATRIX'):
            found_matrix = True
            continue
        if found_matrix:
            arr.append(line.replace(';', ' '))
    if not arr:
        raise NexusError('no alignment was found')
    # the matrix tokens alternate taxon name, sequence, taxon name, ...
    tokens = ' '.join(arr).split()
    if len(tokens) % 2 != 0:
        raise NexusError('expected the alignment to be a list of (taxon, sequence) pairs')
    # re-emit the pairs as fasta text and parse that into an alignment
    alignment_out = StringIO()
    for header, sequence in iterutils.chopped(tokens, 2):
        sequence = sequence.upper()
        unexpected_letters = set(sequence) - set('ACGT')
        if unexpected_letters:
            raise NexusError('unexpected sequence character(s): %s' % list(unexpected_letters))
        print >> alignment_out, '>%s' % header
        print >> alignment_out, sequence
    alignment_string = alignment_out.getvalue()
    self.alignment = Fasta.Alignment(StringIO(alignment_string))
def load(self, lines):
    """
    Load nexus data, extracting the tree and the alignment.
    On success this sets self.tree and self.alignment.
    @param lines: lines of nexus data
    @raise NexusError: when a required block is absent or malformed
    """
    # get the taxa, tree, and character lines
    taxa_lines = []
    tree_lines = []
    character_lines = []
    current_array = None
    for line in iterutils.stripped_lines(lines):
        # Ignore an entire line that is a comment.
        # Nested comments and multi-line comments
        # are not correctly processed here.
        if line.startswith('[') and line.endswith(']'):
            self.add_comment(line[1:-1])
            continue
        # BEGIN/END delimiters select which list receives the lines between
        tokens = line.upper().split()
        if tokens == ['BEGIN', 'TAXA;']:
            current_array = taxa_lines
        elif tokens == ['BEGIN', 'TREES;']:
            current_array = tree_lines
        elif tokens == ['BEGIN', 'CHARACTERS;']:
            current_array = character_lines
        elif tokens == ['END;']:
            current_array = None
        elif current_array is not None:
            current_array.append(line)
    # assert that tree lines and character lines are present
    if not tree_lines:
        raise NexusError('TREES was not found')
    if not character_lines:
        raise NexusError('CHARACTERS was not found')
    # read the newick tree string
    nexus_tree_string = ''.join(tree_lines)
    if nexus_tree_string.count(';') != 1:
        raise NexusError('expected exactly one semicolon in the nexus TREES block')
    if nexus_tree_string.count('=') != 1:
        raise NexusError('expected exactly one equals sign in the nexus TREES block')
    # everything after the single equals sign is the newick string
    offset = nexus_tree_string.find('=')
    newick_string = nexus_tree_string[offset+1:]
    self.tree = Newick.parse(newick_string, Newick.NewickTree)
    # read the alignment matrix
    arr = []
    found_matrix = False
    for line in character_lines:
        # only lines after the MATRIX keyword belong to the alignment
        if line.upper().startswith('DIMENSIONS'):
            continue
        if line.upper().startswith('FORMAT'):
            continue
        if line.upper().startswith('MATRIX'):
            found_matrix = True
            continue
        if found_matrix:
            arr.append(line.replace(';', ' '))
    if not arr:
        raise NexusError('no alignment was found')
    # tokens alternate: taxon name, sequence, taxon name, sequence, ...
    tokens = ' '.join(arr).split()
    if len(tokens) % 2 != 0:
        raise NexusError('expected the alignment to be a list of (taxon, sequence) pairs')
    # convert the pairs to fasta text, then parse it as an alignment
    alignment_out = StringIO()
    for header, sequence in iterutils.chopped(tokens, 2):
        sequence = sequence.upper()
        unexpected_letters = set(sequence) - set('ACGT')
        if unexpected_letters:
            raise NexusError('unexpected sequence character(s): %s' % list(unexpected_letters))
        print >> alignment_out, '>%s' % header
        print >> alignment_out, sequence
    alignment_string = alignment_out.getvalue()
    self.alignment = Fasta.Alignment(StringIO(alignment_string))