def test_write_fasta_1(self):
    """Verify write_fasta() creates readable fasta formatted file"""
    fileio.write_fasta(self.fasta_dict_1, self.fasta_file)

    # Parse the file back as a single fasta record.
    parsed = SeqIO.read(self.fasta_file, "fasta")

    # The first (identifier, sequence) entry of the fixture is the
    # reference the parsed record is compared against.
    expected_id, expected_seq = list(self.fasta_dict_1.items())[0]

    self.assertEqual(parsed.id, expected_id)
    self.assertEqual(str(parsed.seq), expected_seq)
def create_genes_fasta(alchemist, values, fasta_dir, db_name,
                       mol_type="prot", verbose=False, data_cache=None):
    """Write the translations mapped from `values` to `<db_name>.fasta`.

    :param alchemist: Passed through to map_translations().
    :param values: Passed through to map_translations().
    :param fasta_dir: Directory (object exposing joinpath()) for the file.
    :param db_name: Base name of the fasta file to create.
    :type db_name: str
    :param mol_type: Accepted for interface compatibility; not used here.
    :param verbose: Accepted for interface compatibility; not used here.
    :param data_cache: Normalised to a dict when None; not otherwise
        used by this function.
    :returns: Path of the fasta file that was written.
    """
    data_cache = {} if data_cache is None else data_cache

    file_name = ".".join((db_name, "fasta"))
    fasta_path = fasta_dir.joinpath(file_name)

    pdm_fileio.write_fasta(map_translations(alchemist, values), fasta_path)
    return fasta_path
def test_reintroduce_fasta_duplicates_2(self):
    """Verify reintroduce_duplicates() recognizes duplicate sequences"""
    fileio.write_fasta(self.test_fa_1_gs_to_ts, self.fasta_file)
    fileio.reintroduce_fasta_duplicates(self.test_fa_2_ts_to_gs,
                                        self.fasta_file)

    with self.fasta_file.open(mode="r") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            # One sub-test per record so each failing id is reported.
            with self.subTest(seq_id=rec.id):
                expected_seq = self.test_fa_2_gs_to_ts[rec.id]

                self.assertIsNotNone(expected_seq)
                self.assertEqual(str(rec.seq), expected_seq)
def test_write_fasta_2(self):
    """Verify write_fasta() can properly concatenate fasta files"""
    fileio.write_fasta(self.fasta_dict_2, self.fasta_file)

    expected = self.fasta_dict_2
    record_count = 0
    for record in SeqIO.parse(self.fasta_file, "fasta"):
        record_count += 1
        # assertIn reports the offending id on failure.
        self.assertIn(record.id, expected)
        self.assertEqual(str(record.seq), expected[record.id])

    # Without this check the test would pass vacuously if the file were
    # empty or missing entries, because every other assertion lives
    # inside the loop over the parsed records.
    self.assertEqual(record_count, len(expected))
def write_phams_process(fasta_dir, aln_dir, pham, pham_translations):
    """Write the fasta (and, when applicable, alignment) file of a pham.

    :param fasta_dir: Directory (object exposing joinpath()) that receives
        `<pham>_genes.fasta`.
    :param aln_dir: Directory (object exposing joinpath()) that receives
        `<pham>_genes.aln`.
    :param pham: Pham identifier; converted with str() for the file names.
    :param pham_translations: Mapping iterated as translation -> indexable
        collection of gene ids.
    :type pham_translations: dict
    """
    pham_label = str(pham)
    fasta_path = fasta_dir.joinpath(pham_label + "_genes.fasta")
    aln_path = aln_dir.joinpath(pham_label + "_genes.aln")

    # Only the first gene id of each translation is written at first.
    gs_to_ts = {gene_ids[0]: translation
                for translation, gene_ids in pham_translations.items()}
    fileio.write_fasta(gs_to_ts, fasta_path)

    # An alignment is only produced for more than one distinct translation.
    if len(pham_translations) > 1:
        run_clustalo(fasta_path, aln_path)
        fileio.reintroduce_fasta_duplicates(pham_translations, aln_path)

    fileio.reintroduce_fasta_duplicates(pham_translations, fasta_path)
def create_pham_hmms_process(working_dir, pham, ts_to_gs, name, M, seq_id,
                             add_cons, seq_lim):
    """Align the translations of a pham and build an HMM from the result.

    :param working_dir: Directory (object exposing joinpath()) that
        receives `<pham>.aln` and `<pham>.hhm`.
    :param pham: Pham identifier; converted with str() for the file names.
    :param ts_to_gs: Mapping iterated as translation -> indexable
        collection of gene ids.
    :type ts_to_gs: dict
    :param name: Passed through to hhmake().
    :param M: Passed through to hhmake().
    :param seq_id: Passed through to hhmake().
    :param add_cons: Passed through to hhmake().
    :param seq_lim: Passed through to hhmake().
    :returns: Tuple of (pham, path of the HMM file).
    :rtype: tuple
    """
    aln_path = working_dir.joinpath(".".join([str(pham), "aln"]))
    hmm_path = working_dir.joinpath(".".join([str(pham), "hhm"]))

    # write_fasta() takes identifier -> sequence, so each translation is
    # keyed by its first gene id.  The mapping used to be built the other
    # way round (translation -> gene id), which wrote the sequence as the
    # fasta header and the gene id as the sequence.
    gs_to_ts = {ids[0]: ts for ts, ids in ts_to_gs.items()}

    pdm_fileio.write_fasta(gs_to_ts, aln_path)

    # The alignment is written over the unaligned fasta file.
    clustalo(aln_path, aln_path, None, None, "fasta", "fasta")
    hhmake(aln_path, hmm_path, name, add_cons, seq_lim, M, seq_id)

    return (pham, hmm_path)
def estimate_linker_sequence(source_cluster, target_centroid_seq, temp_dir):
    """Descend the guide tree towards the leaf most similar to a target.

    At every node with two children, the furthest sequence under each
    child is scored against `target_centroid_seq` and the walk continues
    into the left child only when its score is strictly greater.
    Scores are cached on each leaf under comment["TargetIdentity"].

    :param source_cluster: Cluster whose guide tree (`GT`) is traversed.
    :param target_centroid_seq: Sequence the candidate leaves are scored
        against.
    :param temp_dir: Directory (object exposing joinpath()) used for the
        linker's fasta file.
    :returns: The selected leaf, with comment["FilePath"] populated.
    """
    node = source_cluster.GT.clade

    while node.is_bifurcating() and len(node.clades) > 1:
        left_child, right_child = node.clades[0], node.clades[1]

        # Resolve both representative leaves before scoring either.
        leaves = [get_furthest_sequence(source_cluster, child)
                  for child in (left_child, right_child)]

        identities = []
        for leaf in leaves:
            identity = leaf.comment.get("TargetIdentity")
            if identity is None:
                identity = alignment.calculate_levenshtein(
                                            leaf.comment["Sequence"],
                                            target_centroid_seq,
                                            identity=True)
                leaf.comment["TargetIdentity"] = identity
            identities.append(identity)

        left_pid, right_pid = identities
        # Ties go to the right child.
        node = left_child if left_pid > right_pid else right_child

    linker = get_furthest_sequence(source_cluster, node)
    if linker.comment.get("FilePath") is not None:
        return linker

    linker_path = temp_dir.joinpath(".".join([linker.name, "fasta"]))
    if not linker_path.is_file():
        pdm_fileio.write_fasta({linker.name: linker.comment["Sequence"]},
                               linker_path)
    linker.comment["FilePath"] = linker_path

    return linker
def create_pham_alns_process(working_dir, pham, ts_to_gs, mat_out, tree_out,
                             infile_type, outfile_type):
    """Align the translations of a pham and restore duplicate sequences.

    :param working_dir: Directory (object exposing joinpath()) that
        receives the output files.
    :param pham: Pham identifier; converted with str() for the file names.
    :param ts_to_gs: Mapping iterated as translation -> indexable
        collection of gene ids.
    :type ts_to_gs: dict
    :param mat_out: When truthy, a `<pham>.mat` path is passed to
        clustalo(); otherwise None is passed.
    :param tree_out: When truthy, a `<pham>.tree` path is passed to
        clustalo(); otherwise None is passed.
    :param infile_type: Passed through to clustalo().
    :param outfile_type: Passed through to clustalo().
    :returns: Tuple of (pham, path of the alignment file).
    :rtype: tuple
    """
    stem = str(pham)

    def _output_path(extension):
        # Build `<working_dir>/<pham>.<extension>`.
        return working_dir.joinpath(".".join([stem, extension]))

    aln_path = _output_path("aln")
    mat_path = _output_path("mat") if mat_out else None
    tree_path = _output_path("tree") if tree_out else None

    # Only the first gene id of each translation goes into the alignment.
    gs_to_ts = {ids[0]: ts for ts, ids in ts_to_gs.items()}
    pdm_fileio.write_fasta(gs_to_ts, aln_path)

    # The alignment is written over the unaligned fasta file.
    clustalo(aln_path, aln_path, mat_path, tree_path,
             infile_type, outfile_type)
    pdm_fileio.reintroduce_fasta_duplicates(ts_to_gs, aln_path)

    return (pham, aln_path)
def estimate_min_distance(source_cluster, target_cluster, temp_dir, data_dir,
                          align=False):
    """Score the linker sequences of two clusters against each other.

    For each cluster the linker is the leaf chosen by
    estimate_linker_sequence() when the cluster has a guide tree (`GT`),
    and the cluster centroid otherwise.

    :param source_cluster: First cluster.
    :param target_cluster: Second cluster.
    :param temp_dir: Directory (object exposing joinpath()) holding the
        per-linker fasta files.
    :param data_dir: Directory (object exposing joinpath()) that receives
        the pairwise alignment file.
    :param align: When truthy, score with a "needle" pairwise alignment;
        otherwise use alignment.calculate_levenshtein().
    :type align: bool
    :returns: The value returned by alignment.calculate_levenshtein(), or,
        when aligning, the alignment's "identity" annotation divided by
        the longer linker length and multiplied by 100.
    """
    if source_cluster.GT is None:
        source_linker = source_cluster.CentroidID
        source_linker_seq = source_cluster.centroid_seq_str
        source_linker_path = temp_dir.joinpath(
                                    ".".join([source_linker, "fasta"]))
    else:
        source_leaf = estimate_linker_sequence(
                                    source_cluster,
                                    target_cluster.centroid_seq_str,
                                    temp_dir)
        source_linker = source_leaf.name
        source_linker_seq = source_leaf.comment["Sequence"]
        source_linker_path = source_leaf.comment["FilePath"]

    if target_cluster.GT is None:
        target_linker = target_cluster.CentroidID
        target_linker_seq = target_cluster.centroid_seq_str
        target_linker_path = temp_dir.joinpath(
                                    ".".join([target_linker, "fasta"]))
    else:
        target_leaf = estimate_linker_sequence(
                                    target_cluster,
                                    source_cluster.centroid_seq_str,
                                    temp_dir)
        target_linker = target_leaf.name
        target_linker_seq = target_leaf.comment["Sequence"]
        target_linker_path = target_leaf.comment["FilePath"]

    if not align:
        return alignment.calculate_levenshtein(source_linker_seq,
                                               target_linker_seq)

    source_len = len(source_linker_seq)
    target_len = len(target_linker_seq)

    # Make sure both linkers exist on disk before aligning them.
    linker_files = ((source_linker, source_linker_seq, source_linker_path),
                    (target_linker, target_linker_seq, target_linker_path))
    for linker_name, linker_seq, linker_path in linker_files:
        if not linker_path.is_file():
            pdm_fileio.write_fasta({linker_name: linker_seq}, linker_path)

    pairwise_path = data_dir.joinpath(
                        "".join([source_linker, "__", target_linker, ".fasta"]))

    # Retry until the alignment succeeds, pausing briefly after each
    # ApplicationError.  There is no upper bound on the number of attempts.
    while True:
        try:
            linker_alignment = alignment.pairwise_align(
                                    source_linker_path, target_linker_path,
                                    pairwise_path, tool="needle")
        except ApplicationError:
            time.sleep(0.2)
        else:
            break

    # Normalise by the longer of the two linkers and express as a percent.
    longest = max(source_len, target_len)
    linker_pid = linker_alignment.annotations["identity"] / longest
    return linker_pid * 100
def write_centroids_threadtask(temp_dir, source_path, target_path,
                               source_gs_to_ts, target_gs_to_ts):
    """Create `temp_dir` and write the source and target fasta files.

    :param temp_dir: Directory object to create with mkdir().
    :param source_path: Destination of the source fasta file.
    :param target_path: Destination of the target fasta file.
    :param source_gs_to_ts: Mapping written to `source_path`.
    :param target_gs_to_ts: Mapping written to `target_path`.
    """
    temp_dir.mkdir()

    # Source is written first, then target.
    outputs = ((source_gs_to_ts, source_path),
               (target_gs_to_ts, target_path))
    for gs_to_ts, fasta_path in outputs:
        pdm_fileio.write_fasta(gs_to_ts, fasta_path)