def Perseus_part_2(fn_fasta, ref_seq, exclusion_list, number_of_primers_to_generate, number_of_primers_to_review, inclusion_set_threshold):
    """Score the short-listed primer pairs against the exclusion set.

    Reads primer lines from
    ``<ref_seq>.best100primers.InSet.<inclusion_set_threshold>``, searches each
    primer pair against ``exclusion_set.fna`` through the ``pcr_tools``
    fuzznuc helpers, and writes one row per accepted primer to
    ``<ref_seq>.scored_exclusion.InSet.<inclusion_set_threshold>``.  Each row
    is the input line followed by a ``hit_percentage`` / hit-list column pair
    for every level in 1, 3, 5, 7, 9 (``No_Hits`` replaces the hit list when
    the percentage is zero).

    Args:
        fn_fasta: FASTA path handed to ``generate_seq_dictionary``.
        ref_seq: Reference sequence identifier; its ``str()`` is the prefix of
            the input and output file names.
        exclusion_list: Sequence identifiers written to ``exclusion_set.fna``
            and passed to ``hit_percentage_calc_plus_discovery``.
        number_of_primers_to_generate: Unused here; kept so the signature
            matches the other Perseus entry points.
        number_of_primers_to_review: At most ``number_of_primers_to_review - 1``
            lines of the input file are read.
        inclusion_set_threshold: Only used to build the file names.

    Side effects:
        Writes ``exclusion_set.fna``, ``temp.pat``, ``temp.fzout`` and the
        scored output file in the current working directory.
    """
    # Project-local helpers, imported once instead of on every loop iteration.
    from pcr_tools import execute_fuznuc
    from pcr_tools import generate_seq_dictionary
    from pcr_tools import hit_percentage_calc_plus_discovery
    from pcr_tools import parse_fzout
    from pcr_tools import write_fuznuc_pattern
    from pcr_tools import write_subfasta

    # Values passed as the third argument of hit_percentage_calc_plus_discovery,
    # one output column pair per value.
    # NOTE(review): presumably mismatch allowances -- confirm against pcr_tools.
    levels = (1, 3, 5, 7, 9)
    # Primers whose inclusion-set scores fall below this value are not scored.
    min_inclusivity = 0.01

    master_dictionary = generate_seq_dictionary(fn_fasta)
    chosen_sequence = str(ref_seq)
    write_subfasta(exclusion_list, master_dictionary, "exclusion_set.fna")

    in_set_suffix = ".InSet." + str(inclusion_set_threshold)
    shortlist_path = chosen_sequence + ".best100primers" + in_set_suffix
    scored_path = chosen_sequence + ".scored_exclusion" + in_set_suffix

    # 'with' guarantees both files are closed even if a helper raises.
    with open(shortlist_path, 'r') as shortlist:
        with open(scored_path, 'w') as scored:
            lines_read = 0
            # NOTE(review): the "- 1" means one line fewer than
            # number_of_primers_to_review is examined -- confirm this is intended.
            while lines_read < number_of_primers_to_review - 1:
                lines_read += 1
                # Read one line at a time; we may not need the whole file.
                raw_line = shortlist.readline()
                if not raw_line:
                    break  # end of file: nothing further to review

                line = raw_line.strip()
                fields = line.split()
                if len(fields) < 12:
                    # Blank or truncated line: columns 2, 3, 9 and 11 are required.
                    continue
                forward_primer = fields[2]
                reverse_primer = fields[3]

                # Columns 11 and 9 hold the inclusion-set scores (the original
                # code names them the "two mismatch" and "no mismatch" metrics).
                # Skip primers that are not sufficiently inclusive.
                if float(fields[11]) < min_inclusivity or float(fields[9]) < min_inclusivity:
                    continue

                try:
                    write_fuznuc_pattern(forward_primer, reverse_primer, 'temp.pat', mismatches=4)
                    execute_fuznuc("temp.pat", "exclusion_set.fna", "temp.fzout")
                    fuznuc_hits = parse_fzout("temp.fzout")

                    row = [line]
                    for level in levels:
                        result = hit_percentage_calc_plus_discovery(exclusion_list, fuznuc_hits, level)
                        hits = result['hits']
                        percentage = result['hit_percentage']
                        row.append(str(percentage))
                        if float(percentage) != 0:
                            row.append('|'.join(map(str, hits)))
                        else:
                            row.append('No_Hits')
                except IndexError:
                    # As before, an IndexError while scoring skips this primer
                    # instead of aborting the run.  The row is assembled in
                    # memory first so a skipped primer never leaves a partial,
                    # unterminated row in the output file.
                    continue

                # Same layout as before: every column, including the last, is
                # followed by a tab, then the newline.
                scored.write("\t".join(row) + "\t\n")
def Perseus_part_1(fn_fasta, ref_seq, inclusion_list, number_of_primers_to_generate, number_of_primers_to_review, inclusion_set_threshold):
    """Design candidate primers on the reference and score them on the inclusion set.

    Steps, all operating on files in the current working directory:

    1. Write ``inclusion_set.fna`` (the inclusion sequences) and
       ``choosen.fna`` (the reference sequence) from ``fn_fasta``.
    2. Replace the ambiguity codes R, Y, S, W and K in ``choosen.fna`` with N.
    3. Run Primer3 through the ``pcr_tools`` wrappers, producing
       ``<ref_seq>.pr3set``.
    4. For every candidate line, search the primer pair against
       ``inclusion_set.fna`` and write the line plus a ``hit_percentage`` /
       hit-list column pair per level to
       ``<ref_seq>.scoredprimers.InSet.<inclusion_set_threshold>``.
    5. Call ``sort_single_capture_primers_by_2_columns`` to produce
       ``<ref_seq>.best100primers.InSet.<inclusion_set_threshold>``.

    Args:
        fn_fasta: FASTA path handed to ``generate_seq_dictionary``.
        ref_seq: Identifier of the sequence primers are designed on; its
            ``str()`` is the prefix of the output file names.
        inclusion_list: Sequence identifiers the primers should hit.
        number_of_primers_to_generate: Passed to ``fna_to_pr3_v2_3_4_input``.
        number_of_primers_to_review: Passed to
            ``sort_single_capture_primers_by_2_columns``.
        inclusion_set_threshold: Only used to build the file names.
    """
    import re

    # Project-local helpers, imported once instead of on every loop iteration.
    from pcr_tools import execute_fuznuc
    from pcr_tools import fna_to_pr3_v2_3_4_input
    from pcr_tools import generate_seq_dictionary
    from pcr_tools import hit_percentage_calc_plus_discovery
    from pcr_tools import parse_fzout
    from pcr_tools import parse_pr3_v2_3_4_out
    from pcr_tools import pr3
    from pcr_tools import sort_single_capture_primers_by_2_columns
    from pcr_tools import write_fuznuc_pattern
    from pcr_tools import write_subfasta

    print(inclusion_list)

    # Sequence dictionary built from the FASTA, used to write the sub-FASTAs.
    master_dictionary = generate_seq_dictionary(fn_fasta)
    chosen_sequence = str(ref_seq)
    write_subfasta(inclusion_list, master_dictionary, "inclusion_set.fna")
    # write_subfasta takes a list, so wrap the single reference identifier.
    write_subfasta([chosen_sequence], master_dictionary, "choosen.fna")

    # Mask the most common non-standard (ambiguous) nucleotides with N.
    # NOTE(review): only the header and the first sequence line are kept, so a
    # line-wrapped FASTA record would be truncated -- confirm write_subfasta
    # emits the sequence on a single line.  Lower-case codes and the remaining
    # ambiguity codes are left untouched, as before.
    with open("choosen.fna", 'r') as fasta_in:
        header = fasta_in.readline()
        sequence_line = fasta_in.readline()
    masked_line = re.sub("[RYSWK]", "N", sequence_line)
    with open("choosen.fna", 'w') as fasta_out:
        fasta_out.write(header + masked_line)

    # (1) Generate the Primer3 input, (2) run Primer3, (3) parse its output
    # into the candidate primer file.
    candidates_path = chosen_sequence + ".pr3set"
    fna_to_pr3_v2_3_4_input("choosen.fna", "choosen.pr3in", number_of_primers_to_generate)
    pr3("choosen.pr3in", "choosen.pr3out")
    parse_pr3_v2_3_4_out("choosen.pr3out", candidates_path)

    in_set_suffix = ".InSet." + str(inclusion_set_threshold)
    scored_path = chosen_sequence + ".scoredprimers" + in_set_suffix
    best_path = chosen_sequence + ".best100primers" + in_set_suffix

    # These integers are the number of mismatches allowed in each round.
    # NOTE(review): Perseus_part_2 uses 1, 3, 5, 7, 9 -- confirm the 6 here is
    # intentional and not a typo for 7.
    levels = (1, 3, 5, 6, 9)

    # Test every candidate assay against the inclusion set.
    # 'with' guarantees both files are closed even if a helper raises.
    with open(candidates_path, 'r') as candidates:
        with open(scored_path, 'w') as scored:
            for raw_line in candidates:
                line = raw_line.strip()
                fields = line.split()
                if len(fields) < 4:
                    # Blank or truncated line: the primers live in columns 2
                    # and 3.  Skip it rather than abort with an IndexError.
                    continue
                forward_primer = fields[2]
                reverse_primer = fields[3]

                # Write a temporary pattern file and hand it to fuzznuc.
                # NOTE: fuzznuc must be installed (the original note points at
                # ~/EMBOSS-6.5.7/emboss/fuzznuc).
                write_fuznuc_pattern(forward_primer, reverse_primer, 'temp.pat')
                execute_fuznuc("temp.pat", "inclusion_set.fna", "temp.fzout")
                fuznuc_hits = parse_fzout("temp.fzout")

                # Assemble the row in memory so a failure in a helper cannot
                # leave a partial, unterminated row in the output file.
                row = [line]
                for level in levels:
                    result = hit_percentage_calc_plus_discovery(inclusion_list, fuznuc_hits, level)
                    hits = result['hits']
                    percentage = result['hit_percentage']
                    row.append(str(percentage))
                    if float(percentage) > 0:
                        row.append('|'.join(map(str, hits)))
                    else:
                        row.append('No_Hits')

                # Same layout as before: every column, including the last, is
                # followed by a tab, then the newline.
                scored.write("\t".join(row) + "\t\n")

    # Rank the primer pairs that best capture the inclusion set.
    # NOTE(review): 12 and 10 are presumably the columns to sort on (the
    # hit-percentage columns appended above) -- confirm against pcr_tools.
    sort_single_capture_primers_by_2_columns(scored_path, best_path, number_of_primers_to_review, 12, 10)
def Perseus_part_1_hmm_version(fn_fasta, ref_seq, inclusion_list, number_of_primers_to_generate, number_of_primers_to_review, inclusion_set_threshold):
    """Design primers on a pre-extracted fragment and score them on the inclusion set.

    Variant of ``Perseus_part_1`` in which the Primer3 template is not looked
    up by ``ref_seq`` in ``fn_fasta`` but copied from the existing file
    ``temp_fasta_fragment.fna``.  Unlike ``Perseus_part_1`` it does not run
    the final sorting step.

    Steps, all operating on files in the current working directory:

    1. Write ``inclusion_set.fna`` from ``fn_fasta`` and copy
       ``temp_fasta_fragment.fna`` to ``choosen.fna``.
    2. Replace the ambiguity codes R, Y, S, W and K in ``choosen.fna`` with N.
    3. Run Primer3 through the ``pcr_tools`` wrappers, producing
       ``<ref_seq>.pr3set``.
    4. For every candidate line, search the primer pair against
       ``inclusion_set.fna`` and write the line plus a ``hit_percentage`` /
       hit-list column pair per level to
       ``<ref_seq>.scoredprimers.InSet.<inclusion_set_threshold>``.

    Args:
        fn_fasta: FASTA path handed to ``generate_seq_dictionary``.
        ref_seq: Its ``str()`` is the prefix of the output file names.
        inclusion_list: Sequence identifiers the primers should hit.
        number_of_primers_to_generate: Passed to ``fna_to_pr3_v2_3_4_input``.
        number_of_primers_to_review: Unused here; kept so the signature
            matches ``Perseus_part_1``.
        inclusion_set_threshold: Only used to build the file names.

    Raises:
        IOError/OSError: If ``temp_fasta_fragment.fna`` cannot be copied.  The
            previous shell-based copy ignored this failure and carried on
            with an empty ``choosen.fna``.
    """
    import re
    import shutil

    # Project-local helpers, imported once instead of on every loop iteration.
    from pcr_tools import execute_fuznuc
    from pcr_tools import fna_to_pr3_v2_3_4_input
    from pcr_tools import generate_seq_dictionary
    from pcr_tools import hit_percentage_calc_plus_discovery
    from pcr_tools import parse_fzout
    from pcr_tools import parse_pr3_v2_3_4_out
    from pcr_tools import pr3
    from pcr_tools import write_fuznuc_pattern
    from pcr_tools import write_subfasta

    print(inclusion_list)

    # Sequence dictionary built from the FASTA, used to write the sub-FASTA.
    master_dictionary = generate_seq_dictionary(fn_fasta)
    chosen_sequence = str(ref_seq)
    write_subfasta(inclusion_list, master_dictionary, "inclusion_set.fna")

    # The template comes from a fragment file produced earlier in the
    # pipeline.  Copy it directly rather than shelling out to `more`.
    shutil.copyfile("temp_fasta_fragment.fna", "choosen.fna")

    # Mask the most common non-standard (ambiguous) nucleotides with N.
    # NOTE(review): only the header and the first sequence line are kept, so a
    # line-wrapped FASTA record would be truncated -- confirm the fragment file
    # holds the sequence on a single line.  Lower-case codes and the remaining
    # ambiguity codes are left untouched, as before.
    with open("choosen.fna", 'r') as fasta_in:
        header = fasta_in.readline()
        sequence_line = fasta_in.readline()
    masked_line = re.sub("[RYSWK]", "N", sequence_line)
    with open("choosen.fna", 'w') as fasta_out:
        fasta_out.write(header + masked_line)

    # (1) Generate the Primer3 input, (2) run Primer3, (3) parse its output
    # into the candidate primer file.
    candidates_path = chosen_sequence + ".pr3set"
    fna_to_pr3_v2_3_4_input("choosen.fna", "choosen.pr3in", number_of_primers_to_generate)
    pr3("choosen.pr3in", "choosen.pr3out")
    parse_pr3_v2_3_4_out("choosen.pr3out", candidates_path)

    scored_path = chosen_sequence + ".scoredprimers" + ".InSet." + str(inclusion_set_threshold)

    # These integers are the number of mismatches allowed in each round.
    # NOTE(review): Perseus_part_2 uses 1, 3, 5, 7, 9 -- confirm the 6 here is
    # intentional and not a typo for 7.
    levels = (1, 3, 5, 6, 9)

    # Test every candidate assay against the inclusion set.
    # 'with' guarantees both files are closed even if a helper raises.
    with open(candidates_path, 'r') as candidates:
        with open(scored_path, 'w') as scored:
            for raw_line in candidates:
                line = raw_line.strip()
                fields = line.split()
                if len(fields) < 4:
                    # Blank or truncated line: the primers live in columns 2
                    # and 3.  Skip it rather than abort with an IndexError.
                    continue
                forward_primer = fields[2]
                reverse_primer = fields[3]

                # Write a temporary pattern file and hand it to fuzznuc.
                # NOTE: fuzznuc must be installed (the original note points at
                # ~/EMBOSS-6.5.7/emboss/fuzznuc).
                write_fuznuc_pattern(forward_primer, reverse_primer, 'temp.pat')
                execute_fuznuc("temp.pat", "inclusion_set.fna", "temp.fzout")
                fuznuc_hits = parse_fzout("temp.fzout")

                # Assemble the row in memory so a failure in a helper cannot
                # leave a partial, unterminated row in the output file.
                row = [line]
                for level in levels:
                    result = hit_percentage_calc_plus_discovery(inclusion_list, fuznuc_hits, level)
                    hits = result['hits']
                    percentage = result['hit_percentage']
                    row.append(str(percentage))
                    if float(percentage) > 0:
                        row.append('|'.join(map(str, hits)))
                    else:
                        row.append('No_Hits')

                # Same layout as before: every column, including the last, is
                # followed by a tab, then the newline.
                scored.write("\t".join(row) + "\t\n")