def compute_true_negative_hmm_scores(self):
    """Compute true-negative score distributions for every cluster HMM.
    Clusters with fewer than 10 true-negative sequences are stored as None."""
    # output_dir, keep and keep_dir are module-level settings defined elsewhere
    count = 1
    print("\n")
    for group, file_list in self.group_to_file_list.items():
        for file_name in file_list:
            hmm = self.group_by_file_to_hmm[group][file_name]
            fasta_hash = self.make_cluster_specific_TN_hash(group, file_name)
            if fasta_hash and len(fasta_hash) >= 10:
                scoring_obj = ScoreObject(fasta_hash, hmm)
                score_hash = scoring_obj.bulk_score_computation()
                self.group_by_file_to_twin_score_obj[group][file_name] = scoring_obj
                # only build a separate true-negative HMM if enough sequences exist
                if len(fasta_hash) >= 20:
                    self.group_by_file_to_twin_hmm[group][file_name] = \
                        scoring_obj.compute_full_phmm(
                            os.path.join(output_dir, file_name + ".TN_hmm"))
                print_progress(count, self.valid_input_scope,
                               prefix='\tTrue Negative Score Distributions:\t',
                               suffix='Complete', bar_length=30)
                if keep:
                    keep_file = write_hash_to_fasta(
                        os.path.join(keep_dir,
                                     "{}_{}_TrueNegativeScores.txt".format(group, file_name)),
                        score_hash, "{}\t{}\n")
            else:
                self.group_by_file_to_twin_score_obj[group][file_name] = None
            count += 1
    return self.group_by_file_to_twin_score_obj
def compute_all_hmm_scores(self):
    """Score each cluster's own sequences against its HMM; clusters with
    fewer than 20 sequences are scored iteratively, larger ones in bulk."""
    count = 1
    print("\n")
    for group, file_list in self.group_to_file_list.items():
        for file_name in file_list:
            fasta_hash = self.group_by_file_to_cluster_hash[group][file_name]
            self.group_by_file_to_score_obj[group][file_name] = ScoreObject(
                fasta_hash, self.group_by_file_to_hmm[group][file_name])
            if len(fasta_hash) < 20:
                score_hash = self.group_by_file_to_score_obj[group][
                    file_name].iterative_score_computation()
            else:
                score_hash = self.group_by_file_to_score_obj[group][
                    file_name].bulk_score_computation()
            print_progress(count, self.valid_input_scope,
                           prefix='\tComputing HMM Score Distributions:\t',
                           suffix='Complete', bar_length=30)
            count += 1
            if keep:
                keep_file = write_hash_to_fasta(
                    os.path.join(keep_dir,
                                 "{}_{}_scores.txt".format(group, file_name)),
                    score_hash, "{}\t{}\n")
    return self.group_by_file_to_score_obj
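# 'print_progress', used by the two methods above, is defined elsewhere in
# the project. A minimal sketch, assuming the common carriage-return
# progress-bar recipe and matching the call signature used above:
import sys

def print_progress(iteration, total, prefix='', suffix='', bar_length=30):
    filled = int(round(bar_length * iteration / float(total)))
    bar = '#' * filled + '-' * (bar_length - filled)
    percent = 100.0 * iteration / float(total)
    sys.stdout.write('\r{} |{}| {:.1f}% {}'.format(prefix, bar, percent, suffix))
    if iteration == total:
        sys.stdout.write('\n')
    sys.stdout.flush()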
def write_length_binned_fasta(fasta_dict, cluster_name, location):
    """Takes a dictionary in the style of {'fasta_header': sequence} and
    outputs a fasta file with headers in the style of '>cluster_name_bin'
    at the specified 'location'. Returns a {header: sequence_length} hash."""
    length_hash = {header: len(seq) for header, seq in fasta_dict.items()}
    if not length_hash:
        raise AttributeError("! No length hash")
    if subset:
        # write only one representative sequence per length bin
        bin_to_header = bin_sequence_lengths(length_hash)
        header_to_sequence = {header: fasta_dict[header]
                              for bin_x, header in bin_to_header.items()}
        file_path = write_hash_to_fasta(location, header_to_sequence,
                                        "{}_" + cluster_name + "\n" + "{}\n")
    else:
        file_path = write_hash_to_fasta(location, fasta_dict,
                                        "{}_" + cluster_name + "\n" + "{}\n")
    return length_hash
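# 'bin_sequence_lengths' is not shown in this excerpt. The sketch below is an
# assumption, inferred only from how its return value is consumed above (a
# mapping of length bin -> one representative header); the project's actual
# binning logic may differ.
def bin_sequence_lengths(length_hash, number_of_bins=5):
    """Split the observed length range into equal-width bins and keep the
    first header encountered in each bin."""
    min_len, max_len = min(length_hash.values()), max(length_hash.values())
    bin_width = max(1, ((max_len - min_len) // number_of_bins) + 1)
    bin_to_header = {}
    for header, length in length_hash.items():
        bin_to_header.setdefault((length - min_len) // bin_width, header)
    return bin_to_header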
def generate_hmm_and_filtered_fasta(self, directory):
    count = 1
    removed_group_to_file_list = defaultdict(list)
    for group, file_list in self.group_to_file_list.items():
        for file_name in file_list:
            msa_list = generate_msa(
                self.group_by_file_to_filepath[group][file_name])
            msa_obj = MsaObject(msa_list, file_name, directory)
            msa_obj.msa_to_fasta()
            msa_obj.trim_remove()
            if msa_obj.check_msa_size_and_length() is True:
                # if trimming removed sequences, drop them from the cluster
                # hash and re-align the remaining ones
                if msa_obj.size_history[0] != msa_obj.size_history[-1]:
                    self.group_by_file_to_cluster_hash[group][file_name] = \
                        clean_fasta_hash(
                            self.group_by_file_to_cluster_hash[group][file_name],
                            msa_obj.all_header(), file_name)
                    same_msa_path = write_hash_to_fasta(
                        msa_obj.file_path,
                        self.group_by_file_to_cluster_hash[group][file_name],
                        ">{}\n{}\n")
                    msa_obj.re_align(same_msa_path)
                self.group_by_file_to_msa_obj[group][file_name] = msa_obj
                length_hash = write_length_binned_fasta(
                    self.group_by_file_to_cluster_hash[group][file_name],
                    file_name,
                    os.path.join(output_dir, file_name + ".fasta"))
                self.group_by_file_to_length_range[group][file_name] = \
                    calculate_length_range(length_hash)
                self.group_by_file_to_hmm[group][file_name] = generate_hmm(
                    os.path.join(output_dir, file_name + ".hmm"),
                    msa_obj.file_path)
            else:
                removed_group_to_file_list[group].append(file_name)
                logger_Filtered.warning(
                    "Filtered due to MSA benchmarking {}".format(file_name))
            print_progress(count, self.valid_input_scope,
                           prefix='\tGenerating Hidden Markov Models:\t',
                           suffix='Complete', bar_length=30)
            count += 1
    return self.remove_filtered_files(removed_group_to_file_list)
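# 'generate_hmm' is defined elsewhere. A minimal sketch, assuming it wraps
# HMMER's hmmbuild (which builds a profile HMM at hmm_path from the trimmed
# alignment at msa_path); the project's real wrapper may differ.
import subprocess

def generate_hmm(hmm_path, msa_path):
    subprocess.run(["hmmbuild", hmm_path, msa_path],
                   check=True, stdout=subprocess.DEVNULL)
    return hmm_path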
def bulk_score_computation(self):
    """Write all sequences to a temporary fasta file and score them against
    the profile HMM in a single pass."""
    with tmp.NamedTemporaryFile() as q_tmp:
        write_hash_to_fasta(q_tmp.name, self.fasta_hash)
        self.score_dict = get_phmm_score(self.hmm_path, q_tmp.name)
    return self.score_dict
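# 'get_phmm_score' is defined elsewhere. A minimal sketch, assuming it wraps
# HMMER's hmmsearch and reads full-sequence bit scores (column 6) from the
# --tblout table; the project's real parser may differ.
import subprocess
import tempfile as tmp

def get_phmm_score(hmm_path, fasta_path):
    score_dict = {}
    with tmp.NamedTemporaryFile(mode="r") as tbl:  # POSIX: hmmsearch re-opens it by name
        subprocess.run(["hmmsearch", "--tblout", tbl.name, hmm_path, fasta_path],
                       check=True, stdout=subprocess.DEVNULL)
        for line in tbl:
            if not line.startswith("#"):
                fields = line.split()
                score_dict[fields[0]] = float(fields[5])  # full-sequence bit score
    return score_dict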
def test_formatting_None_if_more_than_two_brackets(self):
    with tmp.NamedTemporaryFile() as hash_tmp:
        file_path = shared_code_box.write_hash_to_fasta(
            hash_tmp.name, {">1": "ATGSAD", ">2": "ADFAT"},
            line_style="{}\n{}\n{}\n")
        self.assertEqual(file_path, None)
def test_formatting_without_dictionary_value(self):
    with tmp.NamedTemporaryFile() as hash_tmp:
        file_path = shared_code_box.write_hash_to_fasta(
            hash_tmp.name, {">1": "ATGSAD", ">2": "ADFAT"},
            line_style="{}")
        self.assertSetEqual(
            set(open(file_path).readlines()[0].split(">")),
            {'', '2', '1'})
def test_right_optional_formatting(self):
    with tmp.NamedTemporaryFile() as hash_tmp:
        file_path = shared_code_box.write_hash_to_fasta(
            hash_tmp.name, {">1": "ATGSAD", ">2": "ADFAT"},
            line_style="{}\t{}\t")
        self.assertSetEqual(
            set(open(file_path).readlines()[0].split("\t")),
            {'>2', 'ADFAT', '>1', 'ATGSAD', ''})
def test_empty_hash(self):
    with tmp.NamedTemporaryFile() as hash_tmp:
        file_path = shared_code_box.write_hash_to_fasta(hash_tmp.name, {})
        self.assertEqual(file_path, None)
def test_fasta_dictionary(self):
    with tmp.NamedTemporaryFile() as hash_tmp:
        file_path = shared_code_box.write_hash_to_fasta(
            hash_tmp.name, {">1": "ATGSAD", ">2": "ADFAT"})
        self.assertSetEqual(
            set(open(file_path).readlines()),
            {'>2\n', 'ADFAT\n', '>1\n', 'ATGSAD\n'})
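# 'write_hash_to_fasta' itself lives in shared_code_box and is not shown in
# this excerpt. The sketch below is an assumption inferred from the tests
# above, not the module's actual code: it returns None for an empty
# dictionary or a line_style with more than two '{}' placeholders, formats
# only the key when a single placeholder is given, and otherwise writes each
# key/value pair with line_style before returning the output path.
def write_hash_to_fasta(file_path, dictionary, line_style="{}\n{}\n"):
    if not dictionary or line_style.count("{}") > 2:
        return None
    with open(file_path, "w") as out_f:
        for header, value in dictionary.items():
            if line_style.count("{}") == 1:
                out_f.write(line_style.format(header))
            else:
                out_f.write(line_style.format(header, value))
    return file_path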