def _read_sequences(filenames):
    """Read an MSA from each filename and return (filename, MSA) pairs.

    All alignments are validated together before being returned, ensuring
    that they are mutually compatible (e.g. contain the same taxa).
    """
    msas = dict((fname, MSA.from_file(fname)) for fname in filenames)
    # Raises if the alignments disagree with one another.
    MSA.validate(*msas.values())
    return msas.iteritems()
def interleaved_phy(msa, add_flag = False, max_name_length = _MAX_NAME_LENGTH):
    """Return the MSA rendered as an interleaved PHYLIP string.

    The header is "<ntaxa> <seqlen>", optionally suffixed with " I" when
    add_flag is set. Names longer than max_name_length are truncated.
    """
    MSA.validate(msa)
    header = "%i %i" % (len(msa), msa.seqlen())
    if add_flag:
        # Optional interleaved-format marker in the header line.
        header += " I"
    result = [header, ""]
    # Name-column width: longest (truncated) name plus at least two spaces.
    padded_len = min(max_name_length, max(len(name) for name in msa.names())) + 2
    # Round the width up so sequence blocks align across records; note that
    # x % -k yields a value in (-k, 0], so subtracting it rounds x UP to a
    # multiple of (_BLOCK_SIZE + _BLOCK_SPACING), then _BLOCK_SPACING is
    # subtracted again -- TODO(review): confirm intended column alignment.
    padded_len -= padded_len % -(_BLOCK_SIZE + _BLOCK_SPACING) + _BLOCK_SPACING
    streams = []
    spacing = " " * _BLOCK_SPACING
    for record in sorted(msa):
        name = record.name[:max_name_length]
        padding = (padded_len - len(name)) * " "
        lines = []
        line = [name, padding]
        # Break the sequence into fixed-size blocks; wrap to a new output
        # line once the current one reaches _LINE_SIZE characters.
        for block in grouper(_BLOCK_SIZE, record.sequence, fillvalue = ""):
            block = "".join(block)
            if sum(len(segment) for segment in line) >= _LINE_SIZE:
                lines.append("".join(line))
                line = [block]
            else:
                line.extend((spacing, block))
        lines.append("".join(line))
        streams.append(lines)
    # Interleave: emit line i of every record before line i+1 of any record,
    # separating the rounds with blank lines.
    for rows in zip(*streams):
        result.extend(row for row in rows)
        result.append("")
    # Drop the trailing blank line added by the loop above.
    result.pop()
    return "\n".join(result)
def test_msa_exclude__remove_one():
    """Excluding one name leaves an MSA containing only the other record."""
    record_a = FASTA("A", None, "ACGT")
    record_b = FASTA("B", None, "GCTA")
    full_msa = MSA([record_a, record_b])
    assert_equal(full_msa.exclude(["B"]), MSA([record_a]))
def _teardown(self, config, temp): # Validate output from MAFFT output_file = reroot_path(temp, self._output_file) try: MSA.from_file(output_file) except MSAError, error: raise NodeError("Invalid MSA produced by MAFFT:\n%s" % (error,))
def test_msa_reduce__multiple_empty_column__all_empty_column_are_removed():
    """reduce() drops every column consisting solely of gaps and/or Ns."""
    before = MSA([FASTA("Name_A", "Meta_A", "-AnTN"),
                  FASTA("Name_B", "Meta_B", "NC-NN")])
    after = MSA([FASTA("Name_A", "Meta_A", "AT"),
                 FASTA("Name_B", "Meta_B", "CN")])
    assert_equal(before.reduce(), after)
def test_msa_to_file__complete_line_test():
    """Sequences are wrapped at 60 characters per line in FASTA output."""
    seq_1 = "ACGATA" * 10 + "CGATAG" * 5
    seq_2 = "CGAATG" * 10 + "TGTCAT" * 5
    msa = MSA([FASTA("barfoo", None, seq_1),
               FASTA("foobar", None, seq_2)])
    expected = ">barfoo\n%s\n%s\n" % (seq_1[:60], seq_1[60:])
    expected += ">foobar\n%s\n%s\n" % (seq_2[:60], seq_2[60:])
    handle = StringIO.StringIO()
    MSA.to_file(msa, handle)
    assert_equal(handle.getvalue(), expected)
def test_msa_split_msa__two_groups():
    """Split key "112" routes positions 1+2 to group "1" and 3 to "2"."""
    source = MSA([FASTA("seq1", None, "ACGCAT"),
                  FASTA("seq2", None, "GAGTGA")])
    expected = {"1": MSA([FASTA("seq1", None, "ACCA"),
                          FASTA("seq2", None, "GATG")]),
                "2": MSA([FASTA("seq1", None, "GT"),
                          FASTA("seq2", None, "GA")])}
    assert_equal(source.split("112"), expected)
def test_msa_split__partial_group():
    """The split key is applied cyclically when shorter than the sequences."""
    source = MSA([FASTA("seq1", None, "ACGCA"),
                  FASTA("seq2", None, "GAGTG")])
    expected = {"1": MSA([FASTA("seq1", None, "AC"),
                          FASTA("seq2", None, "GT")]),
                "2": MSA([FASTA("seq1", None, "CA"),
                          FASTA("seq2", None, "AG")]),
                "3": MSA([FASTA("seq1", None, "G"),
                          FASTA("seq2", None, "G")])}
    assert_equal(source.split("123"), expected)
def test_msa_split__empty_group():
    """Groups with no assigned columns still appear, with empty sequences."""
    source = MSA([FASTA("seq1", None, "AC"),
                  FASTA("seq2", None, "GA")])
    expected = {"1": MSA([FASTA("seq1", None, "A"),
                          FASTA("seq2", None, "G")]),
                "2": MSA([FASTA("seq1", None, "C"),
                          FASTA("seq2", None, "A")]),
                "3": MSA([FASTA("seq1", None, ""),
                          FASTA("seq2", None, "")])}
    assert_equal(source.split("123"), expected)
def _run(self, _config, temp):
    """Read all input MSAs, validate them jointly, and write them to the
    temporary output file as blank-line separated sequential PHYLIP blocks."""
    alignments = [MSA.from_file(fname) for fname in sorted(self.input_files)]
    # Ensure the alignments are mutually compatible before formatting.
    MSA.validate(*alignments)
    blocks = [sequential_phy(part, add_flag = self._add_flag)
              for part in alignments]
    with open(reroot_path(temp, self._out_phy), "w") as output:
        output.write("\n\n".join(blocks))
def _run(self, _config, temp):
    """Validate all input MSAs together and write them as blank-line
    separated sequential PHYLIP blocks to the temporary output file."""
    alignments = []
    for filename in sorted(self.input_files):
        alignments.append(MSA.from_file(filename))
    # Joint validation catches alignments that disagree with one another.
    MSA.validate(*alignments)
    text = "\n\n".join(sequential_phy(part, add_flag=self._add_flag)
                       for part in alignments)
    with open(reroot_path(temp, self._out_phy), "w") as output:
        output.write(text)
def sequential_phy(msa, add_flag = False, max_name_length = _MAX_NAME_LENGTH):
    """Return the MSA rendered as a sequential PHYLIP string.

    The header is "<ntaxa> <seqlen>", optionally suffixed with " S" when
    add_flag is set. Names longer than max_name_length are truncated, and
    each record's name is emitted on its own line above its sequence.
    """
    MSA.validate(msa)
    header = "%i %i" % (len(msa), msa.seqlen())
    if add_flag:
        # Optional sequential-format marker in the header line.
        header += " S"
    spacing = " " * _BLOCK_SPACING
    result = [header, ""]
    for record in sorted(msa):
        result.append(record.name[:max_name_length])
        # Fixed-size sequence blocks, then _NUM_BLOCKS blocks per line.
        blocks = grouper(_BLOCK_SIZE, record.sequence, fillvalue = "")
        lines = grouper(_NUM_BLOCKS, blocks)
        for line in lines:
            # "if block" skips padding entries in the final line; this
            # presumes grouper's default fillvalue is falsy (None) --
            # TODO(review): confirm against the grouper implementation.
            result.append(spacing.join("".join(block) for block in line if block))
    return "\n".join(result)
def _run(self, _config, temp):
    """Apply every configured singleton filter to the input alignment and
    atomically replace the output file with the filtered result."""
    msa = MSA.from_file(self._input_file)
    for (to_filter, groups) in self._filter_by.iteritems():
        msa = msa.filter_singletons(to_filter, groups)
    # Write to a temp-rooted path first, then move into place.
    out_path = fileutils.reroot_path(temp, self._output_file)
    with open(out_path, "w") as handle:
        msa.to_file(handle)
    fileutils.move_file(out_path, self._output_file)
def sequential_phy(msa, add_flag=False, max_name_length=_MAX_NAME_LENGTH):
    """Format an MSA as sequential PHYLIP text.

    Emits a "<ntaxa> <seqlen>" header (plus " S" when add_flag is set),
    then each record's truncated name followed by its sequence, broken
    into _BLOCK_SIZE-character blocks, _NUM_BLOCKS blocks per line.
    """
    MSA.validate(msa)
    header = "%i %i" % (len(msa), msa.seqlen())
    if add_flag:
        header += " S"
    separator = " " * _BLOCK_SPACING
    output = [header, ""]
    for record in sorted(msa):
        output.append(record.name[:max_name_length])
        seq_blocks = grouper(_BLOCK_SIZE, record.sequence, fillvalue="")
        for chunk in grouper(_NUM_BLOCKS, seq_blocks):
            # The final chunk may be padded with falsy fill entries; skip them.
            cells = ["".join(blk) for blk in chunk if blk]
            output.append(separator.join(cells))
    return "\n".join(output)
def test_sequentual_phy__different_length_names_2():
    # Short names should be padded so sequence columns line up.
    msa = MSA([FASTA("Burchelli_4", None, "ACGTTGATAACCAGG"),
               FASTA("Donkey", None, "TGCAGAGTACGACGT")])
    # NOTE(review): the expected string's internal newlines and padding
    # appear to have been collapsed to single spaces (extraction artifact);
    # verify the literal against the original test module before trusting it.
    expected = \
        """2 15 Burchelli_4 ACGTTGATAA CCAGG Donkey TGCAGAGTAC GACGT"""
    # Python 2 print statement, kept for debugging on failure.
    print interleaved_phy(msa), expected
    assert_equal(interleaved_phy(msa), expected)
def _is_sufficently_covered(filepath, min_coverage):
    """Return True if the MSA at filepath is codon-aligned (length divisible
    by three) and the fraction of covered bases meets min_coverage.

    Bases counted as uncovered are 'N'/'n' calls and '-' gaps.
    """
    msa = MSA.from_file(filepath)
    # Reject alignments whose length is not a whole number of codons.
    if msa.seqlen() % 3:
        return False
    uncovered = sum(record.sequence.upper().count("N")
                    + record.sequence.count("-")
                    for record in msa)
    total = float(len(msa) * msa.seqlen())
    return (1.0 - uncovered / total) >= min_coverage
def test_sequentual_phy__different_length_names_1():
    # Over-long names should be truncated to the maximum name length.
    msa = MSA([FASTA("A_short_name", None, "ACGTTGATAACCAGG"),
               FASTA("Another_really_long_sequence_name_that_is_too_long",
                     None, "TGCAGAGTACGACGT")])
    # NOTE(review): the expected string's internal newlines and padding
    # appear to have been collapsed to single spaces (extraction artifact);
    # verify the literal against the original test module before trusting it.
    expected = \
        """2 15 A_short_name ACGTTGATAA CCAGG Another_really_long_sequence_n TGCAGAGTAC GACGT"""
    # Python 2 print statement, kept for debugging on failure.
    print interleaved_phy(msa), expected
    assert_equal(interleaved_phy(msa), expected)
def _run(self, _config, temp):
    # Build a supermatrix: split each per-file MSA into partitions, merge
    # partition parts across files, then write a single interleaved PHYLIP
    # file plus a RAxML-style ".partitions" description.
    merged_msas = []
    for (name, files_dd) in sorted(self._infiles.iteritems()):
        partitions = files_dd["partitions"]
        # One bucket of MSA fragments per partition key.
        msas = dict((key, []) for key in partitions)
        for filename in files_dd["filenames"]:
            msa = MSA.from_file(filename)
            if self._excluded:
                msa = msa.exclude(self._excluded)
            for (key, msa_part) in msa.split(partitions).iteritems():
                msas[key].append(msa_part)
        # Partitions labelled "X" are deliberately discarded.
        msas.pop("X", None)
        for (key, msa_parts) in sorted(msas.iteritems()):
            merged_msa = MSA.join(*msa_parts)
            if self._reduce:
                # reduce() returns None when every column is empty.
                merged_msa = merged_msa.reduce()
            if merged_msa is not None:
                merged_msas.append(("%s_%s" % (name, key), merged_msa))
    out_fname_phy = reroot_path(temp, self._out_prefix + ".phy")
    with open(out_fname_phy, "w") as output_phy:
        # Concatenate all merged partitions into the final supermatrix.
        final_msa = MSA.join(*(msa for (_, msa) in merged_msas))
        output_phy.write(interleaved_phy(final_msa))
    # Emit 1-based, inclusive column ranges for each partition.
    partition_end = 0
    out_fname_parts = reroot_path(temp, self._out_prefix + ".partitions")
    with open(out_fname_parts, "w") as output_part:
        for (name, msa) in merged_msas:
            length = msa.seqlen()
            output_part.write("DNA, %s = %i-%i\n" % (name, partition_end + 1, partition_end + length))
            partition_end += length
def interleaved_phy(msa, add_flag=False, max_name_length=_MAX_NAME_LENGTH):
    """Return the MSA rendered as an interleaved PHYLIP string.

    The header is "<ntaxa> <seqlen>", optionally suffixed with " I" when
    add_flag is set. Names longer than max_name_length are truncated.
    """
    MSA.validate(msa)
    header = "%i %i" % (len(msa), msa.seqlen())
    if add_flag:
        # Optional interleaved-format marker in the header line.
        header += " I"
    result = [header, ""]
    # Name-column width: longest (truncated) name plus at least two spaces.
    padded_len = min(max_name_length, max(len(name) for name in msa.names())) + 2
    # Round the width up so sequence blocks align across records; note that
    # x % -k yields a value in (-k, 0], so subtracting it rounds x UP to a
    # multiple of (_BLOCK_SIZE + _BLOCK_SPACING), then _BLOCK_SPACING is
    # subtracted again -- TODO(review): confirm intended column alignment.
    padded_len -= padded_len % -(_BLOCK_SIZE + _BLOCK_SPACING) + _BLOCK_SPACING
    streams = []
    spacing = " " * _BLOCK_SPACING
    for record in sorted(msa):
        name = record.name[:max_name_length]
        padding = (padded_len - len(name)) * " "
        lines = []
        line = [name, padding]
        # Break the sequence into fixed-size blocks; wrap to a new output
        # line once the current one reaches _LINE_SIZE characters.
        for block in grouper(_BLOCK_SIZE, record.sequence, fillvalue=""):
            block = "".join(block)
            if sum(len(segment) for segment in line) >= _LINE_SIZE:
                lines.append("".join(line))
                line = [block]
            else:
                line.extend((spacing, block))
        lines.append("".join(line))
        streams.append(lines)
    # Interleave: emit line i of every record before line i+1 of any record,
    # separating the rounds with blank lines.
    for rows in zip(*streams):
        result.extend(row for row in rows)
        result.append("")
    # Drop the trailing blank line added by the loop above.
    result.pop()
    return "\n".join(result)
def _run(self, _config, temp):
    # Build a supermatrix: split each per-file MSA into partitions, merge
    # partition parts across files, then write a single interleaved PHYLIP
    # file plus a RAxML-style ".partitions" description.
    merged_msas = []
    for (name, files_dd) in sorted(self._infiles.iteritems()):
        partitions = files_dd["partitions"]
        # One bucket of MSA fragments per partition key.
        msas = dict((key, []) for key in partitions)
        for filename in files_dd["filenames"]:
            msa = MSA.from_file(filename)
            if self._excluded:
                msa = msa.exclude(self._excluded)
            for (key, msa_part) in msa.split(partitions).iteritems():
                msas[key].append(msa_part)
        # Partitions labelled "X" are deliberately discarded.
        msas.pop("X", None)
        for (key, msa_parts) in sorted(msas.iteritems()):
            merged_msa = MSA.join(*msa_parts)
            if self._reduce:
                # reduce() returns None when every column is empty.
                merged_msa = merged_msa.reduce()
            if merged_msa is not None:
                merged_msas.append(("%s_%s" % (name, key), merged_msa))
    out_fname_phy = reroot_path(temp, self._out_prefix + ".phy")
    with open(out_fname_phy, "w") as output_phy:
        # Concatenate all merged partitions into the final supermatrix.
        final_msa = MSA.join(*(msa for (_, msa) in merged_msas))
        output_phy.write(interleaved_phy(final_msa))
    # Emit 1-based, inclusive column ranges for each partition.
    partition_end = 0
    out_fname_parts = reroot_path(temp, self._out_prefix + ".partitions")
    with open(out_fname_parts, "w") as output_part:
        for (name, msa) in merged_msas:
            length = msa.seqlen()
            output_part.write(
                "DNA, %s = %i-%i\n"
                % (name, partition_end + 1, partition_end + length))
            partition_end += length
from nose.tools import assert_equal from pypeline.common.formats.phylip import \ sequential_phy, \ interleaved_phy from pypeline.common.formats.msa import \ MSA from pypeline.common.formats.fasta import \ FASTA _MSA_SHORT_SEQUENCES = \ MSA([FASTA("seq1", None, "ACGTTGATAACCAGG"), FASTA("seq2", None, "TGCAGAGTACGACGT")]) _MSA_MEDIUM_SEQUENCES = \ MSA([FASTA("seq1", None, "ACGTTGATAACCAGGAGGGATTCGCGATTGGTGGTAACGTAGCC"), FASTA("seq2", None, "TGCAGAGTACGACGTCTCCTAGATCCTGGACAATTTAAACCGAA")]) _MSA_LONG_SEQUENCES = \ MSA([FASTA("seq1", None, "CGGATCTGCTCCTCCACTGGCCACGTTTACTGTCCCCCAACCGTT" \ "CGTCCCGACCTAGTTATACTTCTTAGCAAGGTGTAAAACCAGAGATTGAGGTTATAACG" \ "TTCCTAATCAGTTATTAAATTACCGCGCCCCGACAG"), FASTA("seq2", None, "AGTTGAAGAGGCGGAACGTTTGTAAACCGCGCTAACGTAGTTCTA" \ "CAACCAGCCACCCGGTTCGAAGGAACAACTGGTCGCCATAATTAGGCGAAACGATAGTG" \ "CACTAAGGTCAGGTGCGCCCCTGTAAATAATTAGAT")]) _MSA_MEDIUM_NAMES = \ MSA([FASTA("A_really_long_sequence", None, "ACGTTGATAACCAGG"), FASTA("Another_real_long_one!", None, "TGCAGAGTACGACGT")]) _MSA_LONG_NAMES = \
def test_msa_join__three_msa():
    """Joining three MSAs concatenates sequences record by record."""
    joined = MSA.join(_JOIN_MSA_1, _JOIN_MSA_2, _JOIN_MSA_3)
    expected = MSA((FASTA("nc", None, "ACGTGAAAG"),
                    FASTA("nm", None, "TGACTTGAG"),
                    FASTA("miRNA", None, "UCAGACCAU")))
    assert_equal(joined, expected)
def test_msa_join__missing_arguments():
    # NOTE(review): no assertion here -- presumably MSA.join() with no
    # arguments is expected to raise, checked via a @raises decorator that
    # is not visible in this chunk; confirm against the full test module.
    MSA.join()
def test_msa_split_msa__single_group():
    """Splitting into a single group yields one copy of the whole MSA."""
    source = MSA([FASTA("seq1", None, "ACGCAT"),
                  FASTA("seq2", None, "GAGTGA")])
    assert_equal(source.split("111"), {'1': copy.copy(source)})
def test_msa_reduce__only_empty_column__none_is_returned():
    """When every column is empty, reduce() returns None, not an empty MSA."""
    source = MSA([FASTA("Name_A", "Meta_A", "---Nn"),
                  FASTA("Name_B", "Meta_B", "Nn--N")])
    assert_equal(source.reduce(), None)
def test_msa_from_lines__two_entries_with_meta():
    """Per-record meta text is optional; missing meta parses as None."""
    parsed = MSA.from_lines([">seq1", "ACG", ">seq2 Second meta", "TGA"])
    expected = MSA([FASTA("seq1", None, "ACG"),
                    FASTA("seq2", "Second meta", "TGA")])
    assert_equal(parsed, expected)
def test_msa_join__two_msa():
    """Joining two MSAs concatenates sequences record by record."""
    joined = MSA.join(_JOIN_MSA_1, _JOIN_MSA_2)
    expected = MSA((FASTA("nc", None, "ACGTGA"),
                    FASTA("nm", None, "TGACTT"),
                    FASTA("miRNA", None, "UCAGAC")))
    assert_equal(joined, expected)
def test_msa_from_lines__duplicate_names():
    # NOTE(review): no assertion here -- presumably duplicate record names
    # are expected to raise, checked via a @raises decorator that is not
    # visible in this chunk; confirm against the full test module.
    MSA.from_lines([">seq1", "ACG", ">seq1", "TGA"])
def _run(self, _config, temp):
    """Join all input MSAs into one alignment and write it to the temporary
    output path in interleaved PHYLIP format."""
    parts = (MSA.from_file(fname) for fname in sorted(self.input_files))
    combined = MSA.join(*parts)
    with open(reroot_path(temp, self._out_phy), "w") as output:
        output.write(interleaved_phy(combined, add_flag = self._add_flag))
def test_msa_from_lines__mismatched_lengths():
    # NOTE(review): no assertion here -- presumably sequences of unequal
    # length are expected to raise, checked via a @raises decorator that is
    # not visible in this chunk; confirm against the full test module.
    MSA.from_lines([">seq1", "ACG", ">seq2", "TGAN"])
def test_msa_from_lines__empty_name():
    # NOTE(review): no assertion here -- presumably an empty record name is
    # expected to raise, checked via a @raises decorator that is not
    # visible in this chunk; confirm against the full test module.
    MSA.from_lines([">", "ACG", ">seq1", "TGAN"])
def test_msa_from_file__compressed_bz2():
    """MSA.from_file transparently reads bzip2-compressed FASTA files."""
    observed = MSA.from_file("tests/data/fasta_file.fasta.bz2")
    expected = MSA([FASTA("This_is_BZ_FASTA!", None, "CGTNA"),
                    FASTA("This_is_ALSO_BZ_FASTA!", None, "ACGTN")])
    assert_equal(observed, expected)
def test_msa_join__single_msa():
    """Joining a single MSA is the identity operation."""
    assert_equal(MSA.join(_JOIN_MSA_1), _JOIN_MSA_1)
def _run(self, _config, temp):
    """Concatenate all input MSAs and write the joined alignment to the
    temporary output path in interleaved PHYLIP format."""
    alignments = [MSA.from_file(filename)
                  for filename in sorted(self.input_files)]
    joined = MSA.join(*alignments)
    with open(reroot_path(temp, self._out_phy), "w") as output:
        output.write(interleaved_phy(joined, add_flag=self._add_flag))
def test_msa_from_lines__single_entry_with_meta():
    """Text after the name on a header line becomes the record's meta."""
    parsed = MSA.from_lines([">seq1 Meta info", "ACG"])
    assert_equal(parsed, MSA([FASTA("seq1", "Meta info", "ACG")]))
def test_msa_from_lines__single_entry():
    """A single FASTA record parses into a one-record MSA."""
    expected = MSA([FASTA("seq1", None, "ACG")])
    assert_equal(MSA.from_lines([">seq1", "ACG"]), expected)
def test_msa__seqlen__corresponds_to_sequence_lengths():
    """seqlen() reports the shared length of the aligned sequences."""
    alignment = MSA((FASTA("seq1", None, "ACGCGTATGCATGCCGA"),
                     FASTA("seq2", None, "TGAACACACAGTAGGAT")))
    assert_equal(alignment.seqlen(), 17)
def test_msa_split_msa__no_split_by():
    # NOTE(review): no assertion here -- presumably an empty split key is
    # expected to raise, checked via a @raises decorator that is not
    # visible in this chunk; confirm against the full test module.
    msa = MSA([FASTA("seq1", None, "ACG"), FASTA("seq2", None, "GAT")])
    msa.split(split_by = "")