def _teardown(self, config, temp):
    """Check that the alignment written by MAFFT can be parsed as an MSA.

    The output filename is re-rooted under ``temp`` with ``reroot_path`` and
    loaded with ``MSA.from_file``; the parsed alignment is discarded, only
    the success of the parse matters. ``config`` is not used here.

    Raises:
        NodeError: if ``MSA.from_file`` raises ``MSAError`` for the file.
    """
    # Validate output from MAFFT
    output_file = reroot_path(temp, self._output_file)
    try:
        MSA.from_file(output_file)
    except MSAError as error:
        # The "as" form is accepted by Python 2.6+ and Python 3, whereas the
        # older "except MSAError, error" spelling is Python-2-only.
        raise NodeError("Invalid MSA produced by MAFFT:\n%s" % (error,))
def _read_sequences(filenames):
    """Load one MSA per filename and validate the loaded MSAs together.

    Every file is read with ``MSA.from_file`` and all resulting alignments
    are handed to ``MSA.validate`` in a single call.

    Returns:
        An iterator over ``(filename, MSA)`` pairs (``dict.iteritems()``).
    """
    msa_by_filename = dict((path, MSA.from_file(path)) for path in filenames)
    MSA.validate(*msa_by_filename.values())

    return msa_by_filename.iteritems()
def _run(self, _config, temp):
    """Filter the input MSA and write the result to the output file.

    The alignment in ``self._input_file`` is passed through
    ``filter_singletons`` once per ``(to_filter, groups)`` entry of
    ``self._filter_by``. The result is written to a file re-rooted under
    ``temp`` and then moved to ``self._output_file``.
    """
    msa = MSA.from_file(self._input_file)
    for (name, group) in self._filter_by.iteritems():
        msa = msa.filter_singletons(name, group)

    # Write under the temporary root first, then move into place
    staged_path = fileutils.reroot_path(temp, self._output_file)
    with open(staged_path, "w") as staged:
        msa.to_file(staged)

    fileutils.move_file(staged_path, self._output_file)
def _run(self, _config, temp):
    """Write every input MSA as a sequential PHYLIP block to one file.

    Input files are processed in sorted order. The blocks produced by
    ``sequential_phy`` are separated by a blank line and written to
    ``self._out_phy`` re-rooted under ``temp``.
    """
    # Read and check that MSAs share groups
    alignments = []
    for filename in sorted(self.input_files):
        alignments.append(MSA.from_file(filename))
    MSA.validate(*alignments)

    phy_blocks = [sequential_phy(alignment, add_flag=self._add_flag)
                  for alignment in alignments]

    with open(reroot_path(temp, self._out_phy), "w") as output:
        output.write("\n\n".join(phy_blocks))
def _is_sufficently_covered(filepath, min_coverage):
    """Return True if the MSA in ``filepath`` meets the coverage threshold.

    A position counts as not covered when it is "N" (case-insensitive) or
    "-". The covered fraction is taken over all positions of all records
    (number of records times ``msa.seqlen()``).

    Args:
        filepath: Path of the alignment, loaded with ``MSA.from_file``.
        min_coverage: Minimum covered fraction (compared with ``>=``).

    Returns:
        False if the alignment length is not a multiple of 3, if the
        alignment contains no positions at all, or if the covered fraction
        is below ``min_coverage``; True otherwise.
    """
    msa = MSA.from_file(filepath)
    seqlen = msa.seqlen()
    # Presumably the alignment is expected to consist of whole codons.
    if seqlen % 3:
        return False

    total_bases = float(len(msa) * seqlen)
    if not total_bases:
        # Nothing to cover; without this guard the division below would
        # raise ZeroDivisionError for an alignment with zero columns.
        return False

    total_bases_not_covered = 0
    for fasta_record in msa:
        sequence = fasta_record.sequence
        total_bases_not_covered += sequence.upper().count("N")
        total_bases_not_covered += sequence.count("-")

    frac_covered = 1.0 - total_bases_not_covered / total_bases
    return frac_covered >= min_coverage
def _run(self, _config, temp):
    """Concatenate sequential PHYLIP renderings of all input MSAs.

    The files in ``self.input_files`` are loaded in sorted order, validated
    together with ``MSA.validate``, rendered one by one with
    ``sequential_phy`` and written, blank-line separated, to
    ``self._out_phy`` re-rooted under ``temp``.
    """
    # Read and check that MSAs share groups
    ordered_filenames = sorted(self.input_files)
    loaded = [MSA.from_file(path) for path in ordered_filenames]
    MSA.validate(*loaded)

    rendered = []
    for alignment in loaded:
        rendered.append(sequential_phy(alignment, add_flag=self._add_flag))

    destination = reroot_path(temp, self._out_phy)
    with open(destination, "w") as handle:
        handle.write("\n\n".join(rendered))
def _run(self, _config, temp):
    """Write a partitioned, interleaved PHYLIP file plus a partitions file.

    For every entry of ``self._infiles`` (in sorted order) the listed files
    are loaded, optionally stripped of ``self._excluded``, and split
    according to the entry's "partitions" value. The parts are joined per
    partition key (the key "X" is dropped), optionally reduced, and
    collected under the name "<name>_<key>".

    Two files are written under ``temp``: ``<prefix>.phy`` with the join of
    all collected MSAs, and ``<prefix>.partitions`` with one
    "DNA, <name> = <start>-<end>" line per collected MSA, using 1-based,
    inclusive coordinates.
    """
    named_msas = []
    for (name, files_dd) in sorted(self._infiles.iteritems()):
        partitions = files_dd["partitions"]
        parts_by_key = dict((key, []) for key in partitions)

        for filename in files_dd["filenames"]:
            msa = MSA.from_file(filename)
            if self._excluded:
                msa = msa.exclude(self._excluded)

            split_msa = msa.split(partitions)
            for (key, part) in split_msa.iteritems():
                parts_by_key[key].append(part)

        # Parts assigned to the "X" key are never written
        parts_by_key.pop("X", None)
        for (key, parts) in sorted(parts_by_key.iteritems()):
            joined = MSA.join(*parts)
            if self._reduce:
                joined = joined.reduce()

            # reduce() may apparently yield None; such partitions are skipped
            if joined is None:
                continue

            named_msas.append(("%s_%s" % (name, key), joined))

    phy_path = reroot_path(temp, self._out_prefix + ".phy")
    with open(phy_path, "w") as phy_handle:
        combined = MSA.join(*(msa for (_, msa) in named_msas))
        phy_handle.write(interleaved_phy(combined))

    partition_end = 0
    parts_path = reroot_path(temp, self._out_prefix + ".partitions")
    with open(parts_path, "w") as parts_handle:
        for (name, msa) in named_msas:
            partition_start = partition_end + 1
            partition_end += msa.seqlen()
            parts_handle.write("DNA, %s = %i-%i\n"
                               % (name, partition_start, partition_end))
def _run(self, _config, temp):
    """Build per-partition MSAs and write them as PHYLIP with a partition map.

    Each ``(name, files_dd)`` entry of ``self._infiles`` is handled in
    sorted order: its files are read, ``self._excluded`` is removed when
    set, and each MSA is split by ``files_dd["partitions"]``. Parts sharing
    a key are joined (key "X" is discarded) and, when ``self._reduce`` is
    set, reduced; results that are not None are kept as "<name>_<key>".

    Output under ``temp``: ``self._out_prefix + ".phy"`` (interleaved PHYLIP
    of all kept MSAs joined together) and ``self._out_prefix +
    ".partitions"`` (one "DNA, <name> = <first>-<last>" line per kept MSA).
    """
    merged = []
    for (label, spec) in sorted(self._infiles.iteritems()):
        scheme = spec["partitions"]
        collected = dict((key, []) for key in scheme)

        for path in spec["filenames"]:
            alignment = MSA.from_file(path)
            if self._excluded:
                alignment = alignment.exclude(self._excluded)

            for (key, piece) in alignment.split(scheme).iteritems():
                collected[key].append(piece)

        collected.pop("X", None)
        for (key, pieces) in sorted(collected.iteritems()):
            partition_msa = MSA.join(*pieces)
            if self._reduce:
                partition_msa = partition_msa.reduce()

            if partition_msa is not None:
                partition_name = "%s_%s" % (label, key)
                merged.append((partition_name, partition_msa))

    out_fname_phy = reroot_path(temp, self._out_prefix + ".phy")
    with open(out_fname_phy, "w") as output_phy:
        final_msa = MSA.join(*(item[1] for item in merged))
        output_phy.write(interleaved_phy(final_msa))

    # Running offset of the last column already assigned to a partition
    offset = 0
    out_fname_parts = reroot_path(temp, self._out_prefix + ".partitions")
    with open(out_fname_parts, "w") as output_part:
        for (partition_name, partition_msa) in merged:
            length = partition_msa.seqlen()
            line = "DNA, %s = %i-%i\n" % (partition_name,
                                          offset + 1,
                                          offset + length)
            output_part.write(line)
            offset += length
def _run(self, _config, temp):
    """Join all input MSAs and write them as one interleaved PHYLIP file.

    Files in ``self.input_files`` are loaded in sorted order and combined
    with ``MSA.join``; the ``interleaved_phy`` rendering is written to
    ``self._out_phy`` re-rooted under ``temp``.
    """
    loaded = [MSA.from_file(filename)
              for filename in sorted(self.input_files)]
    joined = MSA.join(*loaded)

    with open(reroot_path(temp, self._out_phy), "w") as output:
        output.write(interleaved_phy(joined, add_flag=self._add_flag))
def _run(self, _config, temp):
    """Write the join of all input MSAs in interleaved PHYLIP format.

    Each file in ``self.input_files`` is read (sorted order), the MSAs are
    merged via ``MSA.join`` and the output of ``interleaved_phy`` is written
    to ``self._out_phy`` re-rooted under ``temp``.
    """
    alignments = []
    for path in sorted(self.input_files):
        alignments.append(MSA.from_file(path))
    combined = MSA.join(*alignments)

    destination = reroot_path(temp, self._out_phy)
    with open(destination, "w") as handle:
        phy_text = interleaved_phy(combined, add_flag=self._add_flag)
        handle.write(phy_text)
def test_msa_from_file__compressed_bz2():
    """MSA.from_file on the ``.bz2`` fixture yields the two expected records."""
    records = [
        FASTA("This_is_BZ_FASTA!", None, "CGTNA"),
        FASTA("This_is_ALSO_BZ_FASTA!", None, "ACGTN"),
    ]
    expected_msa = MSA(records)

    observed_msa = MSA.from_file("tests/data/fasta_file.fasta.bz2")

    assert_equal(observed_msa, expected_msa)