Esempio n. 1
0
 def compute_true_negative_hmm_scores(self):
     """Score each cluster's true-negative sequences against its HMM.

     For every ``(group, file_name)`` pair in ``self.group_to_file_list`` the
     cluster-specific true-negative hash is built with
     ``make_cluster_specific_TN_hash``. When it holds at least 10 entries a
     ``ScoreObject`` is created, bulk-scored and stored in
     ``self.group_by_file_to_twin_score_obj``; with 20 or more entries a
     full pHMM is additionally computed and stored in
     ``self.group_by_file_to_twin_hmm``. Clusters with a missing or smaller
     hash get ``None`` as their score object.

     ``output_dir``, ``keep`` and ``keep_dir`` are not defined in this
     method; they are read from the enclosing scope (presumably module-level
     run settings -- confirm against the rest of the file).

     Returns:
         ``self.group_by_file_to_twin_score_obj``.
     """
     count = 1
     print("\n")
     for group, file_list in self.group_to_file_list.items():
         for file_name in file_list:
             hmm = self.group_by_file_to_hmm[group][file_name]
             fasta_hash = self.make_cluster_specific_TN_hash(
                 group, file_name)
             # The truthiness test guards against a None/empty hash before
             # len() is taken.
             if fasta_hash and len(fasta_hash) >= 10:
                 scoring_obj = ScoreObject(fasta_hash, hmm)
                 score_hash = scoring_obj.bulk_score_computation()
                 self.group_by_file_to_twin_score_obj[group][
                     file_name] = scoring_obj
                 if len(fasta_hash) >= 20:
                     self.group_by_file_to_twin_hmm[group][
                         file_name] = scoring_obj.compute_full_phmm(
                             os.path.join(output_dir,
                                          file_name + ".TN_hmm"))
                 if keep:
                     write_hash_to_fasta(
                         os.path.join(
                             keep_dir,
                             "{}_{}_TrueNegativeScores.txt".format(
                                 group, file_name)), score_hash, "{}\t{}\n")
             else:
                 self.group_by_file_to_twin_score_obj[group][
                     file_name] = None
             # Report progress for every file, including skipped ones, so the
             # bar keeps pace with ``count`` and can reach completion even
             # when the last cluster has too few true negatives.
             print_progress(
                 count,
                 self.valid_input_scope,
                 prefix='\tTrue Negative Score Distributions:\t',
                 suffix='Complete',
                 bar_length=30)
             count += 1
     return self.group_by_file_to_twin_score_obj
Esempio n. 2
0
 def compute_all_hmm_scores(self):
     """Create and score a ``ScoreObject`` for every cluster file.

     Each cluster hash from ``self.group_by_file_to_cluster_hash`` is paired
     with its HMM from ``self.group_by_file_to_hmm`` and stored in
     ``self.group_by_file_to_score_obj``. Clusters with fewer than 20
     entries use ``iterative_score_computation``; all others use
     ``bulk_score_computation``. When the outer-scope flag ``keep`` is
     truthy, the scores are also written below ``keep_dir``.

     Returns:
         ``self.group_by_file_to_score_obj``.
     """
     processed = 0
     print("\n")
     for group, file_list in self.group_to_file_list.items():
         for file_name in file_list:
             fasta_hash = self.group_by_file_to_cluster_hash[group][
                 file_name]
             score_obj = ScoreObject(
                 fasta_hash, self.group_by_file_to_hmm[group][file_name])
             self.group_by_file_to_score_obj[group][file_name] = score_obj

             # Choose the scoring strategy by cluster size.
             if len(fasta_hash) >= 20:
                 score_hash = score_obj.bulk_score_computation()
             else:
                 score_hash = score_obj.iterative_score_computation()

             processed += 1
             print_progress(processed,
                            self.valid_input_scope,
                            prefix='\tComputing HMM Score Distributions:\t',
                            suffix='Complete',
                            bar_length=30)

             if keep:
                 scores_path = os.path.join(
                     keep_dir, "{}_{}_scores.txt".format(group, file_name))
                 write_hash_to_fasta(scores_path, score_hash, "{}\t{}\n")
     return self.group_by_file_to_score_obj
Esempio n. 3
0
def write_length_binned_fasta(fasta_dict, cluster_name, location):
    """Write the sequences of *fasta_dict* to *location* and return their lengths.

    *fasta_dict* maps a header to its sequence. Each record is written through
    ``write_hash_to_fasta`` with the line style
    ``"{}_<cluster_name>\\n{}\\n"``. When the outer-scope flag ``subset`` is
    truthy, only the headers returned by ``bin_sequence_lengths`` are written;
    otherwise every record is written.

    Returns:
        A dict mapping every header in *fasta_dict* to its sequence length
        (regardless of ``subset``).

    Raises:
        AttributeError: if *fasta_dict* is empty.
    """
    length_hash = {}
    for header, seq in fasta_dict.items():
        length_hash[header] = len(seq)
    if not length_hash:
        raise AttributeError("! No length hash")

    records = fasta_dict
    if subset:
        # Keep only the header selected for each length bin.
        bin_to_header = bin_sequence_lengths(length_hash)
        records = {
            header: fasta_dict[header]
            for _bin, header in bin_to_header.items()
        }

    line_style = "{}_" + cluster_name + "\n" + "{}\n"
    write_hash_to_fasta(location, records, line_style)
    return length_hash
Esempio n. 4
0
 def generate_hmm_and_filtered_fasta(self, directory):
     """Align, trim and model every cluster file, filtering out failing MSAs.

     For each ``(group, file_name)`` pair an ``MsaObject`` is built from the
     file's path, exported to FASTA and trimmed. If
     ``check_msa_size_and_length()`` is exactly ``True`` the cluster is kept:
     when trimming changed the size history, the cluster hash is cleaned to
     the remaining headers, rewritten and re-aligned; then the length-binned
     FASTA, the length range and the HMM are generated and stored on
     ``self``. Otherwise the file is recorded as removed and a warning is
     logged.

     ``output_dir`` is read from the enclosing scope.

     Args:
         directory: passed through to ``MsaObject`` as its third argument.

     Returns:
         Whatever ``self.remove_filtered_files`` returns for the mapping of
         groups to filtered-out file names.
     """
     filtered_out = defaultdict(list)
     processed = 0
     for group, file_list in self.group_to_file_list.items():
         for file_name in file_list:
             source_path = self.group_by_file_to_filepath[group][file_name]
             msa_obj = MsaObject(
                 generate_msa(source_path), file_name, directory)
             msa_obj.msa_to_fasta()
             msa_obj.trim_remove()

             if msa_obj.check_msa_size_and_length() is not True:
                 filtered_out[group].append(file_name)
                 logger_Filtered.warning(
                     "Filtered due to MSA benchmarking {}".format(
                         file_name))
             else:
                 sizes = msa_obj.size_history
                 if sizes[0] != sizes[-1]:
                     # Trimming dropped sequences: sync the cluster hash
                     # with the surviving headers and re-align.
                     cleaned = clean_fasta_hash(
                         self.group_by_file_to_cluster_hash[group]
                         [file_name], msa_obj.all_header(), file_name)
                     self.group_by_file_to_cluster_hash[group][
                         file_name] = cleaned
                     same_msa_path = write_hash_to_fasta(
                         msa_obj.file_path, cleaned, ">{}\n{}\n")
                     msa_obj.re_align(same_msa_path)

                 self.group_by_file_to_msa_obj[group][file_name] = msa_obj

                 fasta_out = os.path.join(output_dir, file_name + ".fasta")
                 length_hash = write_length_binned_fasta(
                     self.group_by_file_to_cluster_hash[group][file_name],
                     file_name, fasta_out)
                 self.group_by_file_to_length_range[group][
                     file_name] = calculate_length_range(length_hash)

                 hmm_out = os.path.join(output_dir, file_name + ".hmm")
                 self.group_by_file_to_hmm[group][file_name] = generate_hmm(
                     hmm_out, msa_obj.file_path)

             processed += 1
             print_progress(processed,
                            self.valid_input_scope,
                            prefix='\tGenerating Hidden Markov Models:\t',
                            suffix='Complete',
                            bar_length=30)
     return self.remove_filtered_files(filtered_out)
Esempio n. 5
0
 def bulk_score_computation(self):
     """Score all sequences of ``self.fasta_hash`` in one call.

     The hash is written to a named temporary file with
     ``write_hash_to_fasta``; ``get_phmm_score`` is then called with
     ``self.hmm_path`` and that file's name. The result is stored in
     ``self.score_dict`` and returned.
     """
     with tmp.NamedTemporaryFile() as query_file:
         query_path = query_file.name
         write_hash_to_fasta(query_path, self.fasta_hash)
         # Scoring must happen while the temporary file is still open.
         self.score_dict = get_phmm_score(self.hmm_path, query_path)
     return self.score_dict
Esempio n. 6
0
 def test_formatting_None_if_more_then_two_brackets(self):
     """Expect ``None`` when ``line_style`` has three ``{}`` placeholders."""
     with tmp.NamedTemporaryFile() as hash_tmp:
         file_path = shared_code_box.write_hash_to_fasta(
             hash_tmp.name, {">1": "ATGSAD", ">2": "ADFAT"},
             line_style="{}\n{}\n{}\n")
         # assertIsNone is the dedicated identity check for None.
         self.assertIsNone(file_path)
Esempio n. 7
0
 def test_formatting_without_dictionary_value(self):
     """Expect only the keys to be written when ``line_style`` is ``"{}"``."""
     with tmp.NamedTemporaryFile() as hash_tmp:
         file_path = shared_code_box.write_hash_to_fasta(
             hash_tmp.name, {">1": "ATGSAD", ">2": "ADFAT"},
             line_style="{}")
         # Close the result file deterministically instead of leaking the
         # handle returned by a bare open().
         with open(file_path) as written:
             first_line = written.readlines()[0]
         # Dictionary order is not asserted, hence the set comparison.
         self.assertSetEqual(set(first_line.split(">")), {'', '2', '1'})
Esempio n. 8
0
 def test_rigth_optional_formatting(self):
     """Expect keys and values tab-separated for a two-placeholder style."""
     with tmp.NamedTemporaryFile() as hash_tmp:
         file_path = shared_code_box.write_hash_to_fasta(
             hash_tmp.name, {">1": "ATGSAD", ">2": "ADFAT"},
             line_style="{}\t{}\t")
         # Close the result file deterministically instead of leaking the
         # handle returned by a bare open().
         with open(file_path) as written:
             first_line = written.readlines()[0]
         # The trailing tab yields an empty final field after the split.
         self.assertSetEqual(
             set(first_line.split("\t")),
             {'>2', 'ADFAT', '>1', 'ATGSAD', ''})
Esempio n. 9
0
 def test_empty_hash(self):
     """Expect ``None`` when an empty dictionary is passed."""
     with tmp.NamedTemporaryFile() as hash_tmp:
         file_path = shared_code_box.write_hash_to_fasta(hash_tmp.name, {})
         # assertIsNone is the dedicated identity check for None.
         self.assertIsNone(file_path)
Esempio n. 10
0
 def test_fasta_dictionary(self):
     """Expect header and sequence on separate lines with the default style."""
     with tmp.NamedTemporaryFile() as hash_tmp:
         file_path = shared_code_box.write_hash_to_fasta(
             hash_tmp.name, {">1": "ATGSAD", ">2": "ADFAT"})
         # Close the result file deterministically instead of leaking the
         # handle returned by a bare open().
         with open(file_path) as written:
             lines = written.readlines()
         # Record order is not asserted, hence the set comparison.
         self.assertSetEqual(
             set(lines), {'>2\n', 'ADFAT\n', '>1\n', 'ATGSAD\n'})