def best_exonerate_prediction2(region_fasta, query_fasta, dir_path, hmm):
    with tmp.NamedTemporaryFile() as reg_file:
        write_to_tempfile(reg_file.name, region_fasta)
        # first pass: exonerate all query proteins against the candidate region
        ex_obj = run_exonerate("-m p2g -E no", "{}.exon_p2g".format(reg_file.name),
                               dir_path, reg_file.name, query_fasta)
        if ex_obj:
            all_proteins = all_proteins_to_fasta_string(ex_obj)
            TP_scores = markov_model_scoring2(all_proteins, hmm)
            if TP_scores:
                print(TP_scores)
                max_score = max(TP_scores.values())
                # keep every query whose score lies within 10% of the best score
                max_score_header = {header.split(";")[0]
                                    for header, score in TP_scores.items()
                                    if score >= max_score * 0.90}
                fasta_hash = hash_fasta(query_fasta)  # could be hashed once by the caller instead
                with tmp.NamedTemporaryFile() as q_file:
                    max_val_fasta = "\n".join(["{}\n{}".format(header, fasta_hash[header])
                                               for header in max_score_header])
                    write_to_tempfile(q_file.name, max_val_fasta)
                    ex_name = "{}.exon".format(q_file.name)
                    # second pass: re-run exonerate restricted to the best-scoring queries
                    return run_exonerate("-m p2g -E no", ex_name, dir_path, reg_file.name, q_file.name)
    return None
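# Note: write_to_tempfile() is used above but defined elsewhere in the code base. Judging
# from its call sites it simply dumps a string into the already-created temporary file so
# that exonerate can read it; a minimal sketch under that assumption (the real helper may
# differ) could look like this:
def write_to_tempfile_sketch(tmp_name, string):
    """Illustrative sketch only: write 'string' to the file at path 'tmp_name'."""
    with open(tmp_name, "w") as tmp_handle:
        tmp_handle.write(string)
        tmp_handle.flush()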
class TestHashFasta(unittest.TestCase):

    fa_hash = shared_code_box.hash_fasta(os.path.join(test_data, file_name))

    @patch("build_models.hash_fasta", return_value={">a": 1, ">b": 2})
    @patch("build_models.logger_Filtered", return_value=None)
    def test_check_and_hash_fasta_two_entries_returns_none(self, logger, hash_mock):
        fasta_hash = build_models.check_and_hash_fasta(os.path.join(test_data, file_name), "filename")
        self.assertEqual(fasta_hash, None)

    @patch("build_models.hash_fasta", return_value={})
    @patch("build_models.logger_Filtered", return_value=None)
    def test_check_and_hash_fasta_empty_returns_none(self, logger, hash_mock):
        fasta_hash = build_models.check_and_hash_fasta(os.path.join(test_data, file_name), "filename")
        self.assertEqual(fasta_hash, None)

    @patch("build_models.hash_fasta", return_value={">a": 1, ">b": 2, ">c": 3})
    @patch("build_models.logger_Filtered", return_value=None)
    def test_check_and_hash_fasta_three_entries_returns_dict(self, logger, hash_mock):
        fasta_hash = build_models.check_and_hash_fasta(os.path.join(test_data, file_name), "filename")
        self.assertEqual(fasta_hash, {">a": 1, ">b": 2, ">c": 3})

    def test_hash_fasta_equal_number_header_seq(self):
        number_seq = len(self.fa_hash.keys())
        number_header = len(self.fa_hash.values())
        self.assertEqual(number_header, number_seq)

    def test_hash_fasta_number_of_entries(self):
        number_entries = len(self.fa_hash.keys())
        self.assertEqual(number_entries, 190)

    def test_hash_fasta_correct_seq_length(self):
        single_seq = len(self.fa_hash[">AMELL.GB42352-PA"])
        self.assertEqual(single_seq, 942)
def check_and_hash_fasta(fasta_file, file_name):
    """Wrapper around hash_fasta that rejects FASTA files with too few entries."""
    fasta_dict = hash_fasta(fasta_file)
    if not fasta_dict or len(fasta_dict) <= 2:
        logger_Filtered.warning("INPUT ERROR: '{}' - has 2 or fewer sequences\n".format(file_name))
        return None
    return fasta_dict
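# hash_fasta() is defined in shared_code_box and not shown here. The tests above imply it
# maps each header line (keeping the leading ">") to its full sequence string. A minimal
# sketch under that assumption; the exact header handling (e.g. whether descriptions after
# the first whitespace are kept) is a guess:
def hash_fasta_sketch(fasta_file):
    """Illustrative sketch only: return {">header": "sequence"} for every FASTA entry."""
    fasta_dict, header = {}, None
    with open(fasta_file) as fasta:
        for line in fasta:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                header = line.split()[0]
                fasta_dict[header] = []
            elif header is not None:
                fasta_dict[header].append(line)
    return {head: "".join(parts) for head, parts in fasta_dict.items()}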
true_negative_file = args['--orthofinder_files']
check_programs("hmmsearch", "hmmemit", "hmmbuild", "mafft", "trimal")
print("\n{}\n# GenePS #\n{}\n\nPreparing Files...\n".format("#" * 10, "#" * 10))

# set up true-negative lookups if OrthoFinder translation files were provided
blast_specID_protID_hitList = defaultdict(lambda: defaultdict(list))
idPair_2_namePair, namePair_2_idPair = {}, {}
all_protein_fasta_dict = {}
if true_negative_file:
    tn_args = parse_true_negative_arg(true_negative_file)
    blast_path, blast_file_set = get_blast_files(tn_args["blast_dir"])
    idPair_2_namePair, namePair_2_idPair = hash_sequence_translation_file(tn_args["sequenceIDs"])
    all_protein_fasta_dict = hash_fasta(tn_args["proteins"])
    species_ids = hash_species_translation_file(tn_args["speciesIDs"])
    number_blast_files = hash_all_blast_files(species_ids)
    if number_blast_files != len(species_ids):
        print("\t[!] FATAL ERROR: Not all BLAST files could be hashed\n")
        sys.exit()

if keep:
    keep_dir = get_outdir(output_dir, add_dir="intermediate_files")
log_path = os.path.join(output_dir, "LOG.txt")
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M',
    filename=log_path,
    filemode='w')
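# check_programs() is called at the top of this block but defined elsewhere; it presumably
# verifies that the required external tools (hmmsearch, mafft, ...) are on the PATH before
# any work starts. A minimal sketch under that assumption (the real version may report
# failures differently):
import shutil
import sys

def check_programs_sketch(*programs):
    """Illustrative sketch only: exit if any required executable is missing from PATH."""
    missing = [prog for prog in programs if shutil.which(prog) is None]
    if missing:
        print("\t[!] FATAL ERROR: could not find: {}\n".format(", ".join(missing)))
        sys.exit(1)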
def load_data_and_initialize_global_variables(self):
    def mod_next():
        return next(mg).strip().split(":")

    cluster_count = 0
    input_scope = 0
    error_list = []
    for subdir, dirs, files in os.walk(self.gene_ps_results):
        for file_path_str in files:
            if file_path_str.endswith(".GenePS"):
                group_file = os.path.join(subdir, file_path_str)
                with open(group_file) as mg:
                    # header of the *.GenePS file: group name and group size
                    group_name = mg.readline().split(":")[1].strip()
                    self.group_names.append(group_name)
                    self.group_to_consensus_file[group_name] = os.path.join(
                        self.gene_ps_results, group_name + ".fa.consensus")
                    self.group_to_original_path[group_name] = group_file
                    self.group_to_group_size[group_name] = int(mg.readline().split(":")[1].strip())
                    input_scope += self.group_to_group_size[group_name]
                    for line in mg:
                        if line.startswith("#name:"):
                            # per-cluster block: name, score cutoff and length range
                            cluster = line.split(":")[1].strip()
                            cluster_count += 1
                            score = mod_next()[1].strip().split(",")
                            self.group_by_cluster_to_score_cutoff[group_name][cluster] = float(score[0])
                            self.group_by_cluster_to_length_range[group_name][cluster] = mod_next()[1].strip().split(",")
                            tn_hmm_path = os.path.join(self.gene_ps_results, cluster + ".TN_hmm")
                            if os.path.exists(tn_hmm_path):
                                self.group_by_cluster_to_TN_hmm[group_name][cluster] = tn_hmm_path
                            else:
                                self.group_by_cluster_to_TN_hmm[group_name][cluster] = None
                            files_not_found = self.validate_path_files(cluster)
                            if not files_not_found:
                                self.group_by_cluster_to_hmm[group_name][cluster] = os.path.join(
                                    self.gene_ps_results, cluster + ".hmm")
                                self.group_by_cluster_to_fasta_file[group_name][cluster] = os.path.join(
                                    self.gene_ps_results, cluster + ".fasta")
                                self.group_by_cluster_to_fasta_hash[group_name][cluster] = hash_fasta(
                                    self.group_by_cluster_to_fasta_file[group_name][cluster])
                            else:
                                error_list += files_not_found
    return self.check_loaded_data(cluster_count, input_scope, error_list)
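# For reference, the parser above implies a *.GenePS file layout roughly like the example
# below. The field labels in front of the colons are assumptions; the code only requires
# the "#name:" prefix for cluster blocks and "label: value" lines with comma-separated
# values for the score cutoff and length range:
#
#   group: example_group
#   group_size: 21
#   #name: cluster_1
#   score_cutoff: 23.5,1.2
#   length_range: 180,240
#   #name: cluster_2
#   score_cutoff: 31.0,0.8
#   length_range: 95,150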
iterations = int(args['--iterations'])
repeat_limit = int(args['--repeat_limit'])
out_dir = args['--output_path']
out_file = open(out_dir + ".txt", "w")

# iterations  = how many times the data set gets enlarged
# number_prot = number of proteins added to the data set per iteration
# iterations * number_prot = total number of protein sequences
# per iteration, average scores are computed for each model until iterations * number_prot
# sequences are reached, which yields one "average list" per model
# this computation is repeated for every repeat step, giving one average list per repeat
# finally the average lists are averaged, and the mean, minimum and maximum of these
# averages are plotted for both models

cluster_hash = hash_fasta(cluster_file)
x_axis = [number_prot * x for x in range(1, iterations + 1)]
repeat_step = 1
seq_header_list = []
bulk_av_list = []
iter_av_list = []
while repeat_step <= repeat_limit:
    av_single_hmm, av_multiple_hmm = compare_models(iterations, number_prot)
    print("round {} of {}".format(repeat_step, repeat_limit))
    print(av_single_hmm)
    print(av_multiple_hmm)
    bulk_av_list.append(av_single_hmm)
    iter_av_list.append(av_multiple_hmm)
    seq_header_list = []
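# The aggregation described in the comments above (average the per-repeat average lists,
# then plot their mean, minimum and maximum at every x position) is not part of this
# excerpt. One possible sketch, assuming bulk_av_list / iter_av_list end up as lists of
# equally long score lists (one per repeat); the function name is not from the original:
def summarise_repeats(av_lists):
    """Illustrative sketch only: per-position mean, min and max across repeat runs."""
    means, mins, maxs = [], [], []
    for position_scores in zip(*av_lists):
        means.append(sum(position_scores) / len(position_scores))
        mins.append(min(position_scores))
        maxs.append(max(position_scores))
    return means, mins, maxs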