def test_msa_from_lines__two_entries_with_meta():
    # Two headers, the second followed by free text that is expected to end
    # up as the record's meta field.
    raw_lines = [">seq1", "ACG", ">seq2 Second meta", "TGA"]
    first = FASTA("seq1", None, "ACG")
    second = FASTA("seq2", "Second meta", "TGA")
    expected = MSA([first, second])

    assert_equal(MSA.from_lines(raw_lines), expected)
def test_msa_select__remove_one():
    # Selecting only "A" from a two-record MSA is expected to leave just "A".
    kept = FASTA("A", None, "ACGT")
    dropped = FASTA("B", None, "GCTA")
    msa = MSA([kept, dropped])
    expected = MSA([kept])

    assert msa.select(["A"]) == expected
def test_msa_from_file__compressed_bz2():
    # The bz2 fixture is expected to be read back as these two records.
    records = [
        FASTA("This_is_BZ_FASTA!", None, "CGTNA"),
        FASTA("This_is_ALSO_BZ_FASTA!", None, "ACGTN"),
    ]
    expected = MSA(records)
    fixture = test_file("fasta_file.fasta.bz2")

    assert_equal(MSA.from_file(fixture), expected)
def test_msa_exclude__remove_one():
    # Excluding "B" from a two-record MSA is expected to leave only "A".
    kept = FASTA("A", None, "ACGT")
    dropped = FASTA("B", None, "GCTA")
    msa = MSA([kept, dropped])
    expected = MSA([kept])

    assert_equal(msa.exclude(["B"]), expected)
def test_fasta__from_lines__multiple_records():
    # Three records: the first and third have their sequence spread over two
    # lines, and the second has text following its name in the header.
    raw_lines = [
        ">first\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
        ">Second XT:1:0\n",
        "GAGAGCTCAGCTAAC\n",
        ">Third\n",
        "CGCTGACCAAAAACGGACAG\n",
        "GGCATTCGGC\n",
    ]
    first = FASTA("first", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")
    second = FASTA("Second", "XT:1:0", "GAGAGCTCAGCTAAC")
    third = FASTA("Third", None, "CGCTGACCAAAAACGGACAGGGCATTCGGC")

    assert_list_equal(FASTA.from_lines(raw_lines), [first, second, third])
def test_msa_join__three_msa():
    # Joining the three module-level fixtures is expected to give, per name,
    # the 9 bp sequences listed below.
    joined = MSA.join(_JOIN_MSA_1, _JOIN_MSA_2, _JOIN_MSA_3)
    expected = MSA((
        FASTA("nc", None, "ACGTGAAAG"),
        FASTA("nm", None, "TGACTTGAG"),
        FASTA("miRNA", None, "UCAGACCAU"),
    ))

    assert joined == expected
def test_msa_join__two_msa():
    # Joining the first two module-level fixtures is expected to give, per
    # name, the 6 bp sequences listed below.
    joined = MSA.join(_JOIN_MSA_1, _JOIN_MSA_2)
    expected = MSA((
        FASTA("nc", None, "ACGTGA"),
        FASTA("nm", None, "TGACTT"),
        FASTA("miRNA", None, "UCAGAC"),
    ))

    assert joined == expected
def test_msa_reduce__multiple_empty_column__all_empty_column_are_removed():
    # Only the second and fourth columns (A/C and T/N) are expected to
    # survive; the remaining columns hold nothing but N, n and - characters.
    msa = MSA([
        FASTA("Name_A", "Meta_A", "-AnTN"),
        FASTA("Name_B", "Meta_B", "NC-NN"),
    ])
    expected = MSA([
        FASTA("Name_A", "Meta_A", "AT"),
        FASTA("Name_B", "Meta_B", "CN"),
    ])

    assert_equal(msa.reduce(), expected)
def test_msa_filter_singletons__filter_by_third():
    # Filters "Seq1" of the module-level fixture using only "Seq3" and
    # compares the whole resulting MSA against the expected records.
    filtered = _FILTER_MSA_1.filter_singletons("Seq1", ["Seq3"])
    expected = MSA((
        FASTA("Seq1", "Meta1", "nCGNTYCgTn"),
        FASTA("Seq2", "Meta2", "ACTA-WCCTG"),
        FASTA("Seq3", "Meta3", "NCGGTYCGTC"),
    ))

    assert filtered == expected
def test_msa_reduce__one_empty_column__column_are_removed():
    # The middle column ("n" / "-") is expected to be dropped by reduce().
    msa = MSA([
        FASTA("Name_A", "Meta_A", "AnT"),
        FASTA("Name_B", "Meta_B", "C-N"),
    ])
    expected = MSA([
        FASTA("Name_A", "Meta_A", "AT"),
        FASTA("Name_B", "Meta_B", "CN"),
    ])

    assert msa.reduce() == expected
def test_msa_from_file(func, tmp_path):
    # `func` is the opener under test (supplied from outside this view); it
    # is used to write a two-record file that MSA.from_file must read back.
    filename = tmp_path / "test.fasta"
    content = ">This_is_FASTA!\nACGTN\n>This_is_ALSO_FASTA!\nCGTNA\n"
    with func(filename, "wt") as handle:
        handle.write(content)

    expected = MSA([
        FASTA("This_is_FASTA!", None, "ACGTN"),
        FASTA("This_is_ALSO_FASTA!", None, "CGTNA"),
    ])
    assert MSA.from_file(filename) == expected
# Compares interleaved_phy() output for two records with names of different
# lengths against a fixed expected string.
# NOTE(review): the test name says "sequentual" (sic) but the function under
# test is interleaved_phy -- confirm which formatter was intended.
# NOTE(review): the expected literal looks whitespace-collapsed in this view;
# presumably it spans several lines in the real file -- confirm before editing.
def test_sequentual_phy__different_length_names_2(): msa = MSA([ FASTA("Burchelli_4", None, "ACGTTGATAACCAGG"), FASTA("Donkey", None, "TGCAGAGTACGACGT"), ]) expected = """2 15 Burchelli_4 ACGTTGATAA CCAGG Donkey TGCAGAGTAC GACGT""" assert interleaved_phy(msa) == expected
def test_msa_repr():
    # The expected text lists the records as miRNA, nc, nm and shows a meta
    # of None as an empty string.
    records = (
        FASTA("nc", None, "ACGTA"),
        FASTA("nm", "META", "TGAGT"),
        FASTA("miRNA", None, "UCAGA"),
    )
    msa = MSA(records)
    expected = (
        "MSA(FASTA('miRNA', '', 'UCAGA'), "
        "FASTA('nc', '', 'ACGTA'), "
        "FASTA('nm', 'META', 'TGAGT'))"
    )

    assert str(msa) == expected
def test_msa_to_file__complete_line_test():
    # Each sequence is 90 bp; the expected output breaks it into a 60 bp
    # line followed by a 30 bp line.
    barfoo_lines = ("ACGATA" * 10, "CGATAG" * 5)
    foobar_lines = ("CGAATG" * 10, "TGTCAT" * 5)
    msa = MSA([
        FASTA("barfoo", None, barfoo_lines[0] + barfoo_lines[1]),
        FASTA("foobar", None, foobar_lines[0] + foobar_lines[1]),
    ])

    expected = ">barfoo\n%s\n%s\n" % barfoo_lines
    expected += ">foobar\n%s\n%s\n" % foobar_lines

    buf = StringIO.StringIO()
    MSA.to_file(msa, buf)
    assert_equal(buf.getvalue(), expected)
def _read_mitochondria(self, tar_handle, filename):
    """Read and validate the mitochondrial FASTA stored in a tar archive.

    Parameters:
        tar_handle: object providing getmember() and extractfile() for the
            archive (getmember is expected to raise KeyError when the
            member is absent).
        filename: name of the FASTA member inside the archive.

    Returns:
        None if the archive has no member named `filename`; otherwise a
        dict mapping record names to FASTA records whose sequences have
        been upper-cased.

    Raises:
        ZonkeyDBError: if a sequence contains characters other than
            A, C, G, T, N and -, if a record name occurs twice, if no
            records are found, if more than two distinct sequence lengths
            are present, or if two lengths are present and their
            difference is not self.settings["MitoPadding"].
    """
    try:
        tar_handle.getmember(filename)
    except KeyError:
        # Missing MT file is allowed
        return None

    handle = tar_handle.extractfile(filename)

    results = {}
    for record in FASTA.from_lines(handle):
        # Normalize case before validating the alphabet
        record = FASTA(name=record.name,
                       meta=record.meta,
                       sequence=record.sequence.upper())

        unexpected = set(record.sequence) - set("ACGTN-")
        if unexpected:
            unexpected = ", ".join(map(repr, sorted(unexpected)))
            # Argument order matches the message: file first, then the
            # offending characters
            raise ZonkeyDBError("Unexpected nucleotide in %s; only A, C, "
                                "G, T, N, and - are allowed, not %s"
                                % (filename, unexpected))
        elif record.name in results:
            raise ZonkeyDBError("Duplicate sequence name in %s: %r"
                                % (filename, record.name))

        results[record.name] = record

    lengths = frozenset(len(record.sequence) for record in results.values())

    if not lengths:
        raise ZonkeyDBError("No mitochondrial sequences found in %r"
                            % (filename, ))
    elif len(lengths) > 2:
        lengths = tuple(sorted(lengths))
        lengths_s = "%s, and %s" % (", ".join(map(str, lengths[:-1])),
                                    lengths[-1])

        raise ZonkeyDBError("At most two different sequence lengths "
                            "expected for mitochondrial sequences, but "
                            "found %i different lengths in %r: %s"
                            % (len(lengths), filename, lengths_s))
    elif len(lengths) != 1:
        # Unpadded sequences are allowed
        delta_len = max(lengths) - min(lengths)
        mito_padding = self.settings["MitoPadding"]

        if delta_len != mito_padding:
            raise ZonkeyDBError("Length difference between mitochondrial "
                                "sequences in %r does not match the "
                                "padding; expected a difference of %i bp, "
                                "but found a %i bp difference."
                                % (filename, mito_padding, delta_len))

    return results
def test_fasta__from_file(func, tmp_path):
    # `func` is the opener under test (supplied from outside this view);
    # records written through it must be read back unchanged.
    path = tmp_path / "file"
    expected = [
        FASTA("This_is_FASTA!", None, "ACGTN"),
        FASTA("This_is_ALSO_FASTA!", None, "CGTNA"),
    ]

    with func(path, "wt") as handle:
        for record in expected:
            record.write(handle)

    observed = list(FASTA.from_file(path))
    assert observed == expected
def _setup(self, _config, temp):
    """Populate the temporary folder `temp` with the template files.

    Writes "template.ctl" via self._update_ctl_file, symlinks the trees
    file as "template.trees", and writes every record of the sequence file
    whose name is not in self._exclude_groups to "template.seqs", with the
    sequence upper-cased and the meta information dropped.
    """
    ctl_path = os.path.join(temp, "template.ctl")
    trees_path = os.path.join(temp, "template.trees")
    seqs_path = os.path.join(temp, "template.seqs")

    self._update_ctl_file(source=self._control_file, destination=ctl_path)
    os.symlink(os.path.abspath(self._trees_file), trees_path)

    with open(seqs_path, "w") as handle:
        for record in FASTA.from_file(self._sequence_file):
            if record.name in self._exclude_groups:
                continue

            cleaned = FASTA(record.name, None, record.sequence.upper())
            handle.write("%s\n" % (cleaned,))
def _run(self, _config, temp):
    """Write one FASTA file per sequence name into `temp`.

    Every input file in self._infiles (a mapping of sample name to FASTA
    path) is opened with pysam.FastaFile. For each name in self._sequences
    a file "<name>.fasta" is written to `temp`, holding that sequence
    fetched from every input, in sorted sample order; each record is named
    after the sample and carries the sequence name as meta.

    The pysam handles are closed when the function exits, including when
    an error interrupts the run.
    """
    fasta_files = []
    try:
        for (name, filename) in sorted(self._infiles.items()):
            fasta_files.append((name, pysam.FastaFile(filename)))

        for sequence_name in sorted(self._sequences):
            filename = os.path.join(temp, sequence_name + ".fasta")
            with open(filename, "w") as out_handle:
                for (sample, fasta_file) in fasta_files:
                    sequence = fasta_file.fetch(sequence_name)
                    fasta = FASTA(sample, sequence_name, sequence)
                    fasta.write(out_handle)
    finally:
        # Release the indexed FASTA handles opened above
        for (_, fasta_file) in fasta_files:
            fasta_file.close()
# Compares interleaved_phy() output against a fixed expected string for one
# short name and one name that appears truncated in the expected output.
# NOTE(review): the test name says "sequentual" (sic) but the function under
# test is interleaved_phy -- confirm which formatter was intended.
# NOTE(review): the expected literal looks whitespace-collapsed in this view;
# presumably it spans several lines in the real file -- confirm before editing.
# NOTE(review): the `print` statement is Python 2 syntax and looks like
# leftover debugging output.
def test_sequentual_phy__different_length_names_1(): msa = MSA([ FASTA("A_short_name", None, "ACGTTGATAACCAGG"), FASTA("Another_really_long_sequence_name_that_is_too_long", None, "TGCAGAGTACGACGT") ]) expected = """2 15 A_short_name ACGTTGATAA CCAGG Another_really_long_sequence_n TGCAGAGTAC GACGT""" print interleaved_phy(msa), expected assert_equal(interleaved_phy(msa), expected)
def test_index_and_collect_contigs__fai_files(tmp_path):
    fasta_file = tmp_path / "test.fasta"
    fai_file = tmp_path / "test.fasta.fai"
    with fasta_file.open("wt") as handle:
        _TEST_FASTA_1_A.write(handle)

    # Fai file should be created once, and then not modified: stat the
    # index after each of two consecutive calls and compare the results
    snapshots = []
    for _ in range(2):
        FASTA.index_and_collect_contigs(fasta_file)
        snapshots.append(fai_file.stat())

    after_first, after_second = snapshots
    assert after_first == after_second
def filter_singletons(self, to_filter, filter_using):
    """Return a new MSA in which one record is filtered against others.

    self._group(filter_using, to_filter) supplies the records to compare
    against, the remaining records, and the record to filter. For every
    column in which the filtered record is not "N" or "-", the nucleotides
    it encodes (per NT_CODES) are intersected with those encoded by the
    non-"N"/"-" nucleotides of the comparison records. An empty
    intersection becomes "N". Where the re-encoded genotype differs from
    the original (upper-cased) character, it is written in lower case.

    The returned MSA holds the filtered record followed by the comparison
    records and the remaining records.
    """
    included, excluded, target = self._group(filter_using, to_filter)

    bases = list(target.sequence)
    references = [record.sequence.upper() for record in included]

    for index, column in enumerate(zip(*references)):
        current_nt = bases[index].upper()
        if current_nt in "N-":
            # Nothing to filter at uncalled or gap positions
            continue

        allowed_nts = set()
        for reference_nt in column:
            if reference_nt not in "N-":
                allowed_nts.update(NT_CODES[reference_nt])

        shared_nts = frozenset(NT_CODES[current_nt]) & allowed_nts
        genotype = encode_genotype(shared_nts if shared_nts else "N")

        if genotype != current_nt:
            # Lower case marks positions changed by the filter
            bases[index] = genotype.lower()

    filtered = FASTA(target.name, target.meta, "".join(bases))
    return MSA([filtered] + included + excluded)
def from_lines(cls, lines):
    """Build an MSA from FASTA-formatted lines.

    `lines` (a file or list of lines) is handed to FASTA.from_lines and
    the resulting records are wrapped in an MSA. Records are expected in
    FASTA format, with any meta information following the first space of
    the header:

      >NAME META-INFORMATION
      SEQUENCE

    NOTE(review): `cls` is not used; the result is always a plain MSA.
    """
    records = FASTA.from_lines(lines)
    return MSA(records)
def truncate_sequences(sequences, name):
    """Return copies of all records cut to the length of one of them.

    `sequences` maps keys to FASTA records. Every record is rebuilt with
    its sequence sliced to the length of sequences[name].sequence; name
    and meta are carried over and the keys of the mapping are kept.
    """
    to_len = len(sequences[name].sequence)

    return {
        key: FASTA(name=record.name,
                   meta=record.meta,
                   sequence=record.sequence[:to_len])
        for key, record in sequences.items()
    }
def test_msa_split__partial_group():
    # 5 bp split by the key "123": groups "1" and "2" are expected to get
    # two columns each, group "3" only one.
    msa = MSA([
        FASTA("seq1", None, "ACGCA"),
        FASTA("seq2", None, "GAGTG"),
    ])
    group_1 = MSA([FASTA("seq1", None, "AC"), FASTA("seq2", None, "GT")])
    group_2 = MSA([FASTA("seq1", None, "CA"), FASTA("seq2", None, "AG")])
    group_3 = MSA([FASTA("seq1", None, "G"), FASTA("seq2", None, "G")])
    expected = {"1": group_1, "2": group_2, "3": group_3}

    assert_equal(msa.split("123"), expected)
def test_msa_split__three_groups():
    # 6 bp split by the key "123": each group is expected to get two columns.
    msa = MSA([
        FASTA("seq1", None, "ACGCAT"),
        FASTA("seq2", None, "GAGTGA"),
    ])
    group_1 = MSA([FASTA("seq1", None, "AC"), FASTA("seq2", None, "GT")])
    group_2 = MSA([FASTA("seq1", None, "CA"), FASTA("seq2", None, "AG")])
    group_3 = MSA([FASTA("seq1", None, "GT"), FASTA("seq2", None, "GA")])
    expected = {"1": group_1, "2": group_2, "3": group_3}

    assert msa.split("123") == expected
def test_msa_split__empty_group():
    # 2 bp split by the key "123": group "3" is expected to be present but
    # to hold empty sequences.
    msa = MSA([
        FASTA("seq1", None, "AC"),
        FASTA("seq2", None, "GA"),
    ])
    group_1 = MSA([FASTA("seq1", None, "A"), FASTA("seq2", None, "G")])
    group_2 = MSA([FASTA("seq1", None, "C"), FASTA("seq2", None, "A")])
    group_3 = MSA([FASTA("seq1", None, ""), FASTA("seq2", None, "")])
    expected = {"1": group_1, "2": group_2, "3": group_3}

    assert_equal(msa.split("123"), expected)
def _collect_fasta_contigs(filename, cache={}):
    """Return the contigs of a FASTA file, indexing it on first use.

    Results are memoized in `cache`, keyed by filename; the mutable
    default is deliberate, so that the cache is shared between calls.
    A message is logged when no ".fai" file exists next to `filename`.
    """
    if filename not in cache:
        if not os.path.exists(filename + ".fai"):
            logging.getLogger(__name__).info(
                "Indexing %r; this may take a while", filename
            )

        cache[filename] = FASTA.index_and_collect_contigs(filename)

    return cache[filename]
def __init__(self, options, filename):
    """Load the genome from `filename` and mutate it.

    The FASTA file must contain exactly one record (checked with an
    assert); its upper-cased sequence is stored as self._genome. The
    remaining attributes start out as None before self._mutate(options)
    is called.
    """
    records = list(FASTA.from_file(filename))
    n_records = len(records)
    assert n_records == 1, n_records

    self._genome = records[0].sequence.upper()
    self._sequence = self._positions = self._annotations = None

    self._mutate(options)
def _collect_fasta_contigs(filename, cache={}):
    """Return a dict of the contigs of a FASTA file, indexing it if needed.

    Results are memoized in `cache`, keyed by filename; the mutable
    default is deliberate, so that the cache is shared between calls.
    A notice is printed when no ".fai" file exists next to `filename`.
    """
    if filename not in cache:
        if not os.path.exists(filename + ".fai"):
            print_info(" - Index does not exist for %r; this may "
                       "take a while ..." % (filename,))

        cache[filename] = dict(FASTA.index_and_collect_contigs(filename))

    return cache[filename]
def _validate_prefixes(makefiles):
    """Check the prefixes (reference FASTA files) of the given makefiles.

    For every prefix whose FASTA file exists, the file is indexed and its
    contigs collected; the GATK-style check for the human genome (GATK
    will not run unless the FASTA is ordered 1 .. 23) is applied through
    _do_validate_hg_prefix; and any regions-of-interest files are read
    against the collected contigs.

    prefix["IndexFormat"] is set to ".bai", or to ".csi" when the FASTA
    contains sequences longer than _BAM_MAX_SEQUENCE_LENGTH. Each path is
    validated once; later prefixes with the same path reuse the index
    format chosen the first time.

    Raises MakefileError if the FASTA cannot be indexed or if a
    regions-of-interest file cannot be read.
    """
    validated = {}
    print_info(" - Validating prefixes ...")

    for makefile in makefiles:
        gatk_enabled = makefile["Options"]["Features"]["RealignedBAM"]

        for prefix in makefile["Prefixes"].itervalues():
            path = prefix["Path"]
            if path in validated:
                prefix["IndexFormat"] = validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                print_warn(" - Reference FASTA file does not exist:\n"
                           " %r" % (path,))
                continue

            if not os.path.exists(path + ".fai"):
                print_info(" - Index does not exist for %r; this may "
                           "take a while ..." % (path,))

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs,
                                   fatal=gatk_enabled)

            contigs = dict(contigs)
            rois = prefix.get("RegionsOfInterest", {})
            for (roi_name, roi_path) in rois.iteritems():
                try:
                    # read_bed_file returns iterator; it is run to the end
                    # purely so that any error in the file is raised here
                    for _ in bedtools.read_bed_file(roi_path,
                                                    contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s"
                                        % (roi_name, prefix["Name"], error))

            if max(contigs.itervalues()) > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn(" - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files."
                           % (path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            validated[path] = prefix
def _validate_prefixes(makefiles):
    """Check the prefixes (reference FASTA files) of the given makefiles.

    For every prefix whose FASTA file exists, the file is indexed and its
    contigs collected; the GATK-style check for the human genome (GATK
    will not run unless the FASTA is ordered 1 .. 23) is applied through
    _do_validate_hg_prefix; and any regions-of-interest files are read
    against the collected contigs.

    prefix["IndexFormat"] is set to ".bai", or to ".csi" when the FASTA
    contains sequences longer than _BAM_MAX_SEQUENCE_LENGTH. Each path is
    validated once; later prefixes with the same path reuse the index
    format chosen the first time.

    Raises MakefileError if the FASTA cannot be indexed or if a
    regions-of-interest file cannot be read.
    """
    seen_paths = {}
    print_info(" - Validating prefixes ...")

    for makefile in makefiles:
        fatal_hg_errors = makefile["Options"]["Features"]["RealignedBAM"]

        for prefix in makefile["Prefixes"].itervalues():
            path = prefix["Path"]
            if path in seen_paths:
                prefix["IndexFormat"] = seen_paths[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                print_warn(" - Reference FASTA file does not exist:\n"
                           " %r" % (path, ))
                continue

            if not os.path.exists(path + ".fai"):
                print_info(" - Index does not exist for %r; this may "
                           "take a while ..." % (path, ))

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error, ))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs,
                                   fatal=fatal_hg_errors)

            contigs = dict(contigs)
            regions = prefix.get("RegionsOfInterest", {})
            for (region_name, bed_path) in regions.iteritems():
                try:
                    # read_bed_file returns iterator; it is run to the end
                    # purely so that any error in the file is raised here
                    for _ in bedtools.read_bed_file(bed_path,
                                                    contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s"
                                        % (region_name, prefix["Name"],
                                           error))

            if max(contigs.itervalues()) > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn(" - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files."
                           % (path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            seen_paths[path] = prefix
def _read_mitochondria(self, tar_handle, filename):
    """Read and validate the mitochondrial FASTA stored in a tar archive.

    Parameters:
        tar_handle: object providing getmember() and extractfile() for the
            archive (getmember is expected to raise KeyError when the
            member is absent).
        filename: name of the FASTA member inside the archive.

    Returns:
        None if the archive has no member named `filename`; otherwise a
        dict mapping record names to FASTA records whose sequences have
        been upper-cased.

    Raises:
        ZonkeyDBError: if a sequence contains characters other than
            A, C, G, T, N and -, if a record name occurs twice, if no
            records are found, if more than two distinct sequence lengths
            are present, or if two lengths are present and their
            difference is not self.settings["MitoPadding"].
    """
    try:
        tar_handle.getmember(filename)
    except KeyError:
        # Missing MT file is allowed
        return None

    handle = tar_handle.extractfile(filename)

    results = {}
    for record in FASTA.from_lines(handle):
        # Normalize case before validating the alphabet
        record = FASTA(name=record.name,
                       meta=record.meta,
                       sequence=record.sequence.upper())

        unexpected = set(record.sequence) - set("ACGTN-")
        if unexpected:
            unexpected = ", ".join(map(repr, sorted(unexpected)))
            # Argument order matches the message: file first, then the
            # offending characters
            raise ZonkeyDBError("Unexpected nucleotide in %s; only A, C, "
                                "G, T, N, and - are allowed, not %s"
                                % (filename, unexpected))
        elif record.name in results:
            raise ZonkeyDBError("Duplicate sequence name in %s: %r"
                                % (filename, record.name))

        results[record.name] = record

    lengths = frozenset(len(record.sequence) for record in results.values())

    if not lengths:
        raise ZonkeyDBError("No mitochondrial sequences found in %r"
                            % (filename,))
    elif len(lengths) > 2:
        lengths = tuple(sorted(lengths))
        lengths_s = "%s, and %s" % (", ".join(map(str, lengths[:-1])),
                                    lengths[-1])

        raise ZonkeyDBError("At most two different sequence lengths "
                            "expected for mitochondrial sequences, but "
                            "found %i different lengths in %r: %s"
                            % (len(lengths), filename, lengths_s))
    elif len(lengths) != 1:
        # Unpadded sequences are allowed
        delta_len = max(lengths) - min(lengths)
        mito_padding = self.settings["MitoPadding"]

        if delta_len != mito_padding:
            raise ZonkeyDBError("Length difference between mitochondrial "
                                "sequences in %r does not match the "
                                "padding; expected a difference of %i bp, "
                                "but found a %i bp difference."
                                % (filename, mito_padding, delta_len))

    return results
def _setup(self, _config, temp):
    """Populate the temporary folder `temp` with the template files.

    Writes "template.ctl" via self._update_ctl_file, symlinks the trees
    file as "template.trees", and writes every record of the sequence file
    whose name is not in self._exclude_groups to "template.seqs", with the
    sequence upper-cased and the meta information dropped.
    """
    def in_temp(basename):
        return os.path.join(temp, basename)

    self._update_ctl_file(source=self._control_file,
                          destination=in_temp("template.ctl"))

    os.symlink(os.path.abspath(self._trees_file), in_temp("template.trees"))

    with open(in_temp("template.seqs"), "w") as handle:
        for record in FASTA.from_file(self._sequence_file):
            if record.name in self._exclude_groups:
                continue

            stripped = FASTA(record.name, None, record.sequence.upper())
            handle.write("%s\n" % (stripped,))
def test_fasta__from_lines__empty_record_name_only__first():
    # The first header is followed directly by another header, i.e. the
    # first record has no sequence lines.
    # NOTE(review): no assertion here; presumably an expected-exception
    # decorator sits outside this view -- confirm.
    lines = [">fasta1\n", ">fasta2\n", "AGTC\n"]
    list(FASTA.from_lines(lines))
def test_fasta__from_lines_single_record():
    # One record whose sequence is spread over two lines; the lines are
    # expected to be joined without the newlines.
    raw_lines = [
        ">single\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
    ]
    record = FASTA("single", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")

    assert_list_equal(FASTA.from_lines(raw_lines), [record])
def test_fasta__from_lines__no_records():
    # No input lines are expected to give no records.
    parsed = FASTA.from_lines([])
    assert_list_equal(parsed, [])
def test_fasta__from_file__compressed_bz2():
    # The bz2 fixture is expected to be read back as these two records.
    fixture = test_file("fasta_file.fasta.bz2")
    expected = [
        FASTA("This_is_BZ_FASTA!", None, "CGTNA"),
        FASTA("This_is_ALSO_BZ_FASTA!", None, "ACGTN"),
    ]

    observed = list(FASTA.from_file(fixture))
    assert_equal(observed, expected)
def test_fasta__from_lines__empty_record_name_only__nothing_else():
    # The input is a single header line with no sequence at all.
    # NOTE(review): no assertion here; presumably an expected-exception
    # decorator sits outside this view -- confirm.
    lines = [">fasta1\n"]
    list(FASTA.from_lines(lines))
def test_fasta__from_lines__empty_name__with_others():
    # The first header has no name; a well-formed record follows it.
    # NOTE(review): no assertion here; presumably an expected-exception
    # decorator sits outside this view -- confirm.
    nameless = [">\n", "ACGT\n"]
    well_formed = [">Foo\n", "ACGGTA\n"]
    list(FASTA.from_lines(nameless + well_formed))
def test_fasta__from_lines__empty_name__alone():
    # The only record has a header without a name.
    # NOTE(review): no assertion here; presumably an expected-exception
    # decorator sits outside this view -- confirm.
    records = FASTA.from_lines([">\n", "ACGT\n"])
    list(records)
def test_fasta__from_lines__missing_name__alone():
    # A sequence line with no header line before it.
    # NOTE(review): no assertion here; presumably an expected-exception
    # decorator sits outside this view -- confirm.
    records = FASTA.from_lines(["ACGT\n"])
    list(records)
def test_fasta__from_lines__empty_record_last():
    # A complete record followed by a trailing header without a sequence.
    # NOTE(review): no assertion here; presumably an expected-exception
    # decorator sits outside this view -- confirm.
    complete = [">fasta1\n", "ACGT\n"]
    header_only = [">fasta2\n"]
    list(FASTA.from_lines(complete + header_only))
def test_fasta__from_lines__empty_record__middle():
    # The middle header (fasta1) has no sequence; the records on either
    # side of it are complete.
    # NOTE(review): no assertion here; presumably an expected-exception
    # decorator sits outside this view -- confirm.
    raw_lines = [
        ">fasta0\n",
        "ACGT\n",
        ">fasta1\n",
        ">fasta2\n",
        "AGTC\n",
    ]
    list(FASTA.from_lines(raw_lines))