def test_sequentual_phy__different_length_names_2():
    """Names shorter than the PHYLIP name column are padded so that the
    sequence columns of all rows stay aligned."""
    msa = MSA([
        FASTA("Burchelli_4", None, "ACGTTGATAACCAGG"),
        FASTA("Donkey", None, "TGCAGAGTACGACGT"),
    ])
    # NOTE(review): literal copied verbatim; line breaks inside the
    # triple-quoted string may have been lost upstream -- verify layout.
    expected = """2 15 Burchelli_4 ACGTTGATAA CCAGG Donkey TGCAGAGTAC GACGT"""
    # Removed a leftover Python-2 debug `print` statement (a syntax error
    # under Python 3); assert_equal already reports both values on failure.
    assert_equal(interleaved_phy(msa), expected)
def test_sequentual_phy__different_length_names_1():
    """Over-long names are truncated to the PHYLIP name-column width while
    short names are padded, keeping sequence columns aligned."""
    msa = MSA([
        FASTA("A_short_name", None, "ACGTTGATAACCAGG"),
        FASTA("Another_really_long_sequence_name_that_is_too_long", None,
              "TGCAGAGTACGACGT"),
    ])
    # NOTE(review): literal copied verbatim; line breaks inside the
    # triple-quoted string may have been lost upstream -- verify layout.
    expected = """2 15 A_short_name ACGTTGATAA CCAGG Another_really_long_sequence_n TGCAGAGTAC GACGT"""
    # Removed a leftover Python-2 debug `print` statement (a syntax error
    # under Python 3); assert_equal already reports both values on failure.
    assert_equal(interleaved_phy(msa), expected)
def test_sequentual_phy__different_length_names_2():
    """Names shorter than the PHYLIP name column are padded so that the
    sequence columns of all rows stay aligned."""
    msa = MSA([
        FASTA("Burchelli_4", None, "ACGTTGATAACCAGG"),
        FASTA("Donkey", None, "TGCAGAGTACGACGT"),
    ])
    # NOTE(review): literal copied verbatim; line breaks inside the
    # triple-quoted string may have been lost upstream -- verify layout.
    expected = """2 15 Burchelli_4 ACGTTGATAA CCAGG Donkey TGCAGAGTAC GACGT"""
    # Removed a leftover Python-2 debug `print` statement (a syntax error
    # under Python 3); assert_equal already reports both values on failure.
    assert_equal(interleaved_phy(msa), expected)
def test_sequentual_phy__different_length_names_1():
    """Over-long names are truncated to the PHYLIP name-column width while
    short names are padded, keeping sequence columns aligned."""
    msa = MSA([
        FASTA("A_short_name", None, "ACGTTGATAACCAGG"),
        FASTA("Another_really_long_sequence_name_that_is_too_long", None,
              "TGCAGAGTACGACGT"),
    ])
    # NOTE(review): literal copied verbatim; line breaks inside the
    # triple-quoted string may have been lost upstream -- verify layout.
    expected = """2 15 A_short_name ACGTTGATAA CCAGG Another_really_long_sequence_n TGCAGAGTAC GACGT"""
    # Removed a leftover Python-2 debug `print` statement (a syntax error
    # under Python 3); assert_equal already reports both values on failure.
    assert_equal(interleaved_phy(msa), expected)
def test_interleaved_phy__with_flag():
    """Passing add_flag=True adds the interleave marker 'I' to the header."""
    expected = """2 15 I seq1 ACGTTGATAA CCAGG seq2 TGCAGAGTAC GACGT"""
    observed = interleaved_phy(_MSA_SHORT_SEQUENCES, add_flag=True)
    assert_equal(observed, expected)
def main(argv):
    """Entry point: builds a mitochondrial phylogeny input set from a BAM.

    Reads the sample's mitochondrial consensus from the BAM, aligns it against
    the database reference, and writes three files next to the requested
    output prefix: ``.summary`` (statistics), ``.phy`` (interleaved PHYLIP
    alignment) and ``.fasta`` (per-sequence FASTA).

    Returns 0 on success and 1 on any validation or I/O error.
    """
    args = parse_args(argv)
    data = database.ZonkeyDB(args.database)
    sequences = data.mitochondria
    log = logging.getLogger(__name__)
    # Open the BAM up-front so unreadable/invalid files fail fast with a
    # log message instead of a traceback.
    try:
        handle = pysam.AlignmentFile(args.bam)
    except (IOError, ValueError) as error:
        log.error("Error reading BAM file: %s", error)
        return 1
    with handle:
        # validate_bam_handle returns None for unusable BAMs; only BAMs
        # mapped against a known mitochondrial contig can be processed.
        bam_info = data.validate_bam_handle(handle)
        if bam_info is None:
            return 1
        elif not bam_info.is_mitochondrial:
            log.error("BAM does not contain any known mitochondrial sequence")
            return 1
        reference = sequences[bam_info.mt_contig]
        # Per-site majority call across the (padded) mitochondrial contig.
        stats, majority = majority_sequence(
            handle,
            padding=bam_info.mt_padding,
            contig_name=bam_info.mt_contig,
            contig_length=bam_info.mt_length,
        )
        sequences["Sample"] = FASTA(
            name="Sample",
            meta=None,
            sequence=align_majority(reference.sequence, majority),
        )
    # Truncate all sequences to match the (now) unpadded sample sequence
    sequences = truncate_sequences(sequences, "Sample")
    sequences = filter_sequences(sequences)
    with open(args.output_prefix + ".summary", "w") as handle:
        stats["filename"] = os.path.abspath(args.bam)
        # Sorted for deterministic, diff-friendly output.
        for key, value in sorted(stats.items()):
            handle.write("{}: {}\n".format(key, value))
    with open(args.output_prefix + ".phy", "w") as handle:
        handle.write(interleaved_phy(sequences_to_msa(sequences)))
    with open(args.output_prefix + ".fasta", "w") as handle:
        for key, record in sorted(sequences.items()):
            handle.write(">{}\n".format(key))
            # Wrap FASTA sequence lines at 60 characters.
            for line in fragment(60, record.sequence):
                handle.write("{}\n".format(line))
    return 0
def test_interleaved_phy__multi_line_sequences():
    """Alignments longer than one interleaved row are wrapped into
    successive blocks of sequence columns."""
    expected = """2 140 seq1 CGGATCTGCT CCTCCACTGG CCACGTTTAC TGTCCCCCAA CCGTTCGTCC seq2 AGTTGAAGAG GCGGAACGTT TGTAAACCGC GCTAACGTAG TTCTACAACC CGACCTAGTT ATACTTCTTA GCAAGGTGTA AAACCAGAGA TTGAGGTTAT AACGTTCCTA AGCCACCCGG TTCGAAGGAA CAACTGGTCG CCATAATTAG GCGAAACGAT AGTGCACTAA ATCAGTTATT AAATTACCGC GCCCCGACAG GGTCAGGTGC GCCCCTGTAA ATAATTAGAT"""
    observed = interleaved_phy(_MSA_LONG_SEQUENCES)
    assert_equal(observed, expected)
def _run(self, _config, temp):
    """Merges per-partition MSAs from all input files and writes a combined
    interleaved PHYLIP alignment plus a RAxML-style partitions file.

    FIX: replaced Python-2-only ``iteritems()`` with ``items()`` for
    Python 3 compatibility and consistency with the sibling implementation
    in this file; behavior is otherwise unchanged.
    """
    merged_msas = []
    for name, files_dd in sorted(self._infiles.items()):
        partitions = files_dd["partitions"]
        # One bucket of MSA slices per partition key.
        msas = dict((key, []) for key in partitions)
        for filename in files_dd["filenames"]:
            msa = MSA.from_file(filename)
            if self._excluded:
                msa = msa.exclude(self._excluded)
            for key, msa_part in msa.split(partitions).items():
                msas[key].append(msa_part)
        # Drop the "X" partition (sites excluded from the analysis).
        msas.pop("X", None)
        for key, msa_parts in sorted(msas.items()):
            merged_msa = MSA.join(*msa_parts)
            if self._reduce:
                # reduce() may yield None when nothing remains.
                merged_msa = merged_msa.reduce()
            if merged_msa is not None:
                merged_msas.append(("%s_%s" % (name, key), merged_msa))
    out_fname_phy = reroot_path(temp, self._out_prefix + ".phy")
    with open(out_fname_phy, "w") as output_phy:
        final_msa = MSA.join(*(msa for (_, msa) in merged_msas))
        output_phy.write(interleaved_phy(final_msa))
    partition_end = 0
    out_fname_parts = reroot_path(temp, self._out_prefix + ".partitions")
    with open(out_fname_parts, "w") as output_part:
        # 1-based, inclusive column ranges for each merged partition.
        for name, msa in merged_msas:
            length = msa.seqlen()
            output_part.write("DNA, %s = %i-%i\n"
                              % (name, partition_end + 1,
                                 partition_end + length))
            partition_end += length
def _run(self, _config, temp):
    """Merges per-partition MSAs from all input files and writes a combined
    interleaved PHYLIP alignment plus a RAxML-style partitions file."""
    merged = []
    for name, files_dd in sorted(self._infiles.items()):
        partitions = files_dd["partitions"]
        # Collect the slices of every input alignment, grouped by key.
        by_key = dict((key, []) for key in partitions)
        for filename in files_dd["filenames"]:
            msa = MSA.from_file(filename)
            if self._excluded:
                msa = msa.exclude(self._excluded)
            for key, part in msa.split(partitions).items():
                by_key[key].append(part)
        # The "X" partition is discarded.
        by_key.pop("X", None)
        for key, parts in sorted(by_key.items()):
            joined = MSA.join(*parts)
            if self._reduce:
                joined = joined.reduce()
            if joined is not None:
                merged.append(("%s_%s" % (name, key), joined))
    phy_path = reroot_path(temp, self._out_prefix + ".phy")
    with open(phy_path, "w") as output_phy:
        combined = MSA.join(*(msa for (_, msa) in merged))
        output_phy.write(interleaved_phy(combined))
    partition_end = 0
    parts_path = reroot_path(temp, self._out_prefix + ".partitions")
    with open(parts_path, "w") as output_part:
        # Emit 1-based, inclusive column ranges for each partition.
        for name, msa in merged:
            length = msa.seqlen()
            output_part.write("DNA, %s = %i-%i\n"
                              % (name, partition_end + 1,
                                 partition_end + length))
            partition_end += length
def test_interleaved_phy__different_lengths():
    """interleaved_phy() must invoke MSA.validate on its input.

    BUG FIX: the original passed ``wrap=MSA.validate`` -- not a valid
    ``mock.patch`` keyword, so it merely set an attribute named ``wrap`` on
    the MagicMock and the real ``MSA.validate`` was never executed.  The
    correct keyword is ``wraps=``, which delegates calls to the real method.
    """
    with patch("paleomix.common.formats.msa.MSA.validate",
               wraps=MSA.validate) as mock:
        interleaved_phy(_MSA_MEDIUM_NAMES)
    mock.assert_called_once()
def test_interleaved_phy__different_lengths():
    """interleaved_phy() must invoke MSA.validate on its input.

    FIX: the expectation object was bound to an unused local ``_mock``;
    flexmock registers the expectation as a side effect of the call chain,
    so the binding was dead code and has been removed.
    """
    # NOTE(review): property-style modifiers (`.at_least.once`) are accepted
    # by older flexmock releases; newer releases require method calls
    # (`.at_least().once()`) -- confirm against the pinned flexmock version.
    flexmock(MSA).should_receive('validate').at_least.once
    interleaved_phy(_MSA_MEDIUM_NAMES)
def _run(self, _config, temp):
    """Joins all input MSAs (in sorted filename order) and writes the result
    to the output path as an interleaved PHYLIP alignment.

    FIX: ``add_flag = self._add_flag`` used spaces around ``=`` in a keyword
    argument, violating PEP 8 and the style used by the sibling
    implementation in this file; no behavioral change.
    """
    msa = MSA.join(*(MSA.from_file(filename)
                     for filename in sorted(self.input_files)))
    with open(reroot_path(temp, self._out_phy), "w") as output:
        output.write(interleaved_phy(msa, add_flag=self._add_flag))
def test_interleaved_phy__medium_names():
    """Names that exactly fill the PHYLIP name column are kept intact."""
    expected = """2 15 A_really_long_sequence ACGTTGATAA CCAGG Another_real_long_one! TGCAGAGTAC GACGT"""
    observed = interleaved_phy(_MSA_MEDIUM_NAMES)
    assert_equal(observed, expected)
def test_interleaved_phy__long_names():
    """Names longer than the PHYLIP name column are truncated."""
    expected = """2 15 A_really_long_sequence_name_th ACGTTGATAA CCAGG Another_really_long_sequence_n TGCAGAGTAC GACGT"""
    observed = interleaved_phy(_MSA_LONG_NAMES)
    assert_equal(observed, expected)
def test_interleaved_phy__short_sequences():
    """Sequences are emitted in space-separated blocks of ten bases."""
    expected = """2 44 seq1 ACGTTGATAA CCAGGAGGGA TTCGCGATTG GTGGTAACGT AGCC seq2 TGCAGAGTAC GACGTCTCCT AGATCCTGGA CAATTTAAAC CGAA"""
    observed = interleaved_phy(_MSA_MEDIUM_SEQUENCES)
    assert_equal(observed, expected)
sequences["Sample"] = FASTA(name="Sample", meta=None, sequence=align_majority(reference.sequence, majority)) # Truncate all sequences to match the (now) unpadded sample sequence sequences = truncate_sequences(sequences, "Sample") sequences = filter_sequences(sequences) with open(args.output_prefix + ".summary", "w") as handle: stats["filename"] = os.path.abspath(args.bam) for key, value in sorted(stats.iteritems()): handle.write("{}: {}\n".format(key, value)) with open(args.output_prefix + ".phy", "w") as handle: handle.write(interleaved_phy(sequences_to_msa(sequences))) with open(args.output_prefix + ".fasta", "w") as handle: for key, record in sorted(sequences.iteritems()): handle.write(">{}\n".format(key)) for line in fragment(60, record.sequence): handle.write("{}\n".format(line)) return 0 if __name__ == '__main__': sys.exit(main(sys.argv[1:]))
def _run(self, _config, temp):
    """Joins all input MSAs (in sorted filename order) and writes the result
    to the output path as an interleaved PHYLIP alignment."""
    alignments = (MSA.from_file(filename)
                  for filename in sorted(self.input_files))
    combined = MSA.join(*alignments)
    destination = reroot_path(temp, self._out_phy)
    with open(destination, "w") as output:
        output.write(interleaved_phy(combined, add_flag=self._add_flag))