import shutil
import os.path
import glob
import csv
import pandas as pd
import primer3
import sys
import numpy as np
import re
from Santalucia_NN_Tm import NN_Tm, complement, mM_monovalent
from Bio import SeqIO
from time_stamp import Time_stamp
from parameters import *
from sequence_processing_functions import fasta_to_seq, gc_content, rev_complement

# Monovalent-cation equivalent computed once from the buffer settings that
# `parameters` exposes (Na, K, Tris, Mg, dNTPs).
monovalent_cation_eq = mM_monovalent(Na=Na, K=K, Tris=Tris, Mg=Mg, dNTPs=dNTPs)

############################################################
#### FUNCTIONS
############################################################


### FUNCTION TO CALCULATE HAIRPIN TM
def hairpin_Tm(primer_sequence, mv_cation=0, primer_conc=0):
    """Return the hairpin Tm of ``primer_sequence`` as a two-decimal string.

    The value is the ``tm`` attribute of the ``primer3.calcHairpin`` result.
    ``mv_cation`` is forwarded as ``mv_conc`` and ``primer_conc`` as
    ``dna_conc``; divalent-cation and dNTP concentrations are fixed at 0,
    ``temp_c`` at 37 and ``max_loop`` at 30.
    """
    hairpin = primer3.calcHairpin(
        primer_sequence,
        mv_conc=mv_cation,
        dv_conc=0,
        dntp_conc=0,
        dna_conc=primer_conc,
        temp_c=37,
        max_loop=30,
    )
    return format(round(hairpin.tm, 2), ".2f")


### FUNCTION TO CALCULATE HOMODIMER TM
def homodimer_Tm(primer_sequence, mv_cation=0, primer_conc=0):
    """Return the homodimer Tm of ``primer_sequence`` as a two-decimal string.

    The value is the ``tm`` attribute of the ``primer3.calcHomodimer`` result,
    computed with the same fixed settings as ``hairpin_Tm`` (no divalent
    cations or dNTPs, ``temp_c`` 37, ``max_loop`` 30).
    """
    homodimer = primer3.calcHomodimer(
        primer_sequence,
        mv_conc=mv_cation,
        dv_conc=0,
        dntp_conc=0,
        dna_conc=primer_conc,
        temp_c=37,
        max_loop=30,
    )
    return format(round(homodimer.tm, 2), ".2f")
def _fasta_header(region_num, chrom, start, length, tm, *extra):
    """Build the underscore-delimited ">TA_..." FASTA header line.

    ``extra`` carries the optional trailing fields (strand, primer sequence)
    that some of the output files append after the Tm.
    """
    fields = [region_num, chrom, start, length, tm]
    fields.extend(extra)
    return ">TA_" + "_".join(str(field) for field in fields) + "\n"


def _design_candidate_primers(seq_file, monovalent):
    """Chop every region in ``seq_file`` into primers that pass the feature filters.

    Header lines (starting with ">") advance the region counter and supply the
    chromosome and start taken from their 2nd and 3rd "_"-separated fields.
    Every other line is passed to ``ChopImputSeq`` once per primer length in
    ``primer_size_range``.  Returns the merged dict produced by those calls.
    """
    # The size range is loop-invariant, so parse it once up front.
    size_bounds = primer_size_range.split("-")
    primer_l_min, primer_l_max = int(size_bounds[0]), int(size_bounds[1])

    candidates = dict()
    region_num = 0
    for line in seq_file:
        if line.startswith('>'):
            region_num += 1
            header_fields = line.rstrip().split('_')
            chrom, start = header_fields[1], header_fields[2]
        else:
            region_seq = line.rstrip()
            for primer_l in range(primer_l_min, primer_l_max + 1):
                candidates.update(
                    ChopImputSeq(region_seq, region_num, primer_l, chrom, start,
                                 minTm, maxTm, GC_range_min, GC_range_max,
                                 CheckATends_flag, CheckGCclamp_flag,
                                 NucleotideRepeatFilter_flag,
                                 NucleotideRepeatFilter_threshold,
                                 self_Tmdiff, monovalent))
    return candidates


def _write_feature_filtered(ranked_primers, table, fasta):
    """Write the feature-filtered primers to the tab table and the BLAST query FASTA.

    ``ranked_primers`` is an iterable of ``(primer, info)`` pairs where ``info``
    is indexed as: 0 region number, 1 chromosome, 2 region start, 3 offset,
    4 length, 5 occurrence, 6 Tm, 7 strand.
    """
    # NOTE(review): this header names 8 columns but every row below carries 9
    # (a leading region number).  Left untouched because downstream parsers
    # may rely on the current layout.
    table.write("chrom\tstart\tend\toccurrence\tsequence\tlen\tstrand\ttm\n")
    for primer, info in ranked_primers:
        region_num, chrom = info[0], info[1]
        start = int(info[2]) + info[3]
        length = info[4]
        end = start + length
        occu, tm, strand = info[5], info[6], info[7]
        table.write("%d\t%s\t%d\t%d\t%d\t%s\t%d\t%s\t%f\n" %
                    (region_num, chrom, start, end, occu, primer, length,
                     strand, tm))
        fasta.write(_fasta_header(region_num, chrom, start, length, tm, strand))
        fasta.write(primer + "\n")


def _ensure_blast_db():
    """Run ``makeblastdb`` when the "zmv2all" database in the CWD looks incomplete.

    The build is triggered only when the existing ``zmv2all.*`` files form a
    strict subset of the six expected index files.
    """
    # subprocess is not among the imports visible at the top of the file;
    # importing it here keeps this helper self-contained.
    import subprocess as sp

    file_set = {
        "zmv2all.nin", "zmv2all.nhr", "zmv2all.nsq", "zmv2all.nsi",
        "zmv2all.nsd", "zmv2all.nog"
    }
    # NOTE(review): any extra file matching "zmv2all.*" makes this test false,
    # so the database is then never (re)built -- confirm this is intended.
    if set(glob.glob("zmv2all.*")) < file_set:
        with open(os.devnull, 'w') as devnull:
            status = sp.call([
                "makeblastdb", "-in",
                "%s" % genome_fasta, "-dbtype", "nucl", "-parse_seqids", "-out",
                "%szmv2all" % refDB_path
            ], stdout=devnull, stderr=devnull)
        if status != 0:
            # Output is discarded above, so surface at least the failure itself.
            sys.stderr.write(
                "[*] Warning: makeblastdb exited with status %d\n" % status)


def _run_exact_blast(fasta_input_file):
    """BLAST the candidate primers against "zmv2all" and return the CSV output text."""
    import subprocess as sp

    word_size = int(primer_size_range.split("-")[0])  # exact blast word size
    command = [
        "blastn", "-task", "blastn",
        "-db", "%szmv2all" % refDB_path,
        "-query", "%s" % fasta_input_file,
        "-evalue", "%s" % em_e_value,
        "-word_size", "%s" % word_size,
        "-gapopen", "%s" % em_gapopen,
        "-gapextend", "%s" % em_gapextend,
        "-reward", "%s" % em_reward,
        "-penalty", "%s" % em_penalty,
        "-dust", "no",
        "-perc_identity", "%s" % em_perc_identity,
        "-max_target_seqs", "%s" % em_max_target_seqs,
        "-max_hsps", "%s" % em_max_hsps,
        "-outfmt", "10 qseq qlen qseqid sacc sstart send sstrand",
        "-num_threads", "%s" % em_num_threads,
    ]
    # universal_newlines=True yields text on both Python 2 and Python 3, so
    # the result can be split on '\n' without a bytes/str mismatch.
    blast = sp.Popen(command, stdout=sp.PIPE, universal_newlines=True)
    blast_output, _ = blast.communicate()
    if blast.returncode != 0:
        # An empty result would otherwise be indistinguishable from
        # "no off-target hits".
        sys.stderr.write(
            "[*] Warning: blastn exited with status %d; "
            "off-target filtering may be incomplete\n" % blast.returncode)
    return blast_output


def _find_off_target_primers(blast_output):
    """Return the set of query sequences that have a full-length hit elsewhere.

    Each CSV row is ``qseq,qlen,qseqid,sacc,sstart,send,sstrand``; ``qseqid``
    is the ">TA_..." header written for the query, from which the primer's own
    chromosome, start, length and strand are recovered.
    """
    off_target = set()
    # The last split element is dropped, as it is the empty remainder after
    # the trailing newline.
    for raw_line in blast_output.split('\n')[:-1]:
        print("%s" % (raw_line))
        fields = raw_line.strip(' ').split(',')
        primer = fields[0]
        qseqid = fields[2].split('_')
        qseq_chr = qseqid[2]
        qseq_start = int(qseqid[3])
        qseq_strand = qseqid[6]
        qseq_stop = qseq_start + int(qseqid[4])
        target_chr = fields[3]
        target_start = int(fields[4])
        target_stop = int(fields[5])
        query_length = int(fields[1])

        # Only hits whose aligned query text has the full query length count.
        if len(primer) != query_length:
            continue

        if qseq_chr != target_chr:
            # Hit on a different chromosome: off-target.
            off_target.add(primer)
        elif qseq_strand == "+":
            # NOTE(review): both coordinates must differ for the hit to be
            # treated as off-target; one matching coordinate keeps it on-target.
            if (qseq_start + 1) != target_start and (qseq_stop - 1) != target_stop:
                off_target.add(primer)
        elif qseq_strand == "-":
            if (qseq_start + 1) != target_stop and (qseq_stop - 1) != target_start:
                off_target.add(primer)
    return off_target


def _write_final_primers(primers, forward_fasta, reverse_fasta, table,
                         info_fasta, all_fasta):
    """Write the primers that survived the BLAST filter to the five final outputs."""
    # NOTE(review): this header names 6 columns but every row carries 7
    # (a leading region number).  Left untouched for downstream compatibility.
    table.write("chrom\tstart\tend\tseq\ttm\tstrand\n")
    for primer, value in primers.items():
        region_num, chrom = value[0], value[1]
        start_pos = int(value[2]) + value[3]
        end_pos = start_pos + value[4]
        tm, strand = value[6], value[7]

        full_header = _fasta_header(region_num, chrom, start_pos, len(primer),
                                    tm, strand, primer)
        sequence_line = primer + "\n"

        if strand == "+":
            forward_fasta.write(full_header)
            forward_fasta.write(sequence_line)
        if strand == "-":
            reverse_fasta.write(full_header)
            reverse_fasta.write(sequence_line)

        table.write("\t".join([str(region_num), chrom, str(start_pos),
                               str(end_pos), primer, str(tm), strand]) + "\n")
        info_fasta.write(
            _fasta_header(region_num, chrom, start_pos, len(primer), tm))
        info_fasta.write(sequence_line)
        all_fasta.write(full_header)
        all_fasta.write(sequence_line)


def main():
    """Design primers from ``sequence.txt``, drop off-target ones, write all outputs.

    Steps:
      1. Chop every region of ``outdir + "sequence.txt"`` into candidate
         primers that pass the feature filters.
      2. Write the candidates to ``UOD_featureFilter.txt`` / ``.fasta``.
      3. BLAST the candidates against the "zmv2all" database (built on demand
         from ``genome_fasta``) and remove every primer with a full-length hit
         away from its own locus.
      4. Write the surviving primers to the forward/reverse/final output files
         and append the counts and run duration to ``run_summary.txt``.

    Exits with status -1 when ``genome_fasta`` does not exist.  Changes the
    working directory to ``refDB_path``.
    """
    # Neither `time` nor `itemgetter` is imported in the visible import list;
    # importing them here avoids relying on a star-import to provide them.
    import time
    from operator import itemgetter

    UOD_all_fasta = outdir + "UOD_featureFilter.fasta"
    monovalent_cation_eq = mM_monovalent(Na=Na, K=K, Tris=Tris, Mg=Mg,
                                         dNTPs=dNTPs)

    # Every handle is context-managed so it is flushed and closed on all exit
    # paths, including the early sys.exit below.  The final output files are
    # still opened (and truncated) up front, as before.
    with open(outdir + "sequence.txt", "r") as seq_file, \
            open(outdir + 'run_summary.txt', 'a') as parameters_used, \
            open(outdir + "UOD_forward_primer.fasta", "w") as FPrimer, \
            open(outdir + "UOD_reverse_primer.fasta", "w") as RPrimer, \
            open(outdir + "UOD_final_primer.txt", "w") as final_UOD_primer, \
            open(outdir + "UOD_final_all_primer_info.fasta", "w") as final_UOD_all_primer_info, \
            open(outdir + "UOD_final_all_primer_fasta.fasta", "w") as final_UOD_all_primer_fasta:

        # 1. candidate primers from the TRS.py result (sequence.txt)
        primer_dict_all = _design_candidate_primers(seq_file,
                                                    monovalent_cation_eq)
        no_primers_designed = len(primer_dict_all)

        # 2. feature-filtered primers; both files must be closed before
        #    blastn reads the FASTA, hence the nested block.
        #    .items() (instead of .iteritems()) works on Python 2 and 3.
        sorted_primer_dict = sorted(primer_dict_all.items(), key=itemgetter(1))
        with open(outdir + "UOD_featureFilter.txt", "w") as unblast_file, \
                open(UOD_all_fasta, "w") as FRprimer:
            _write_feature_filtered(sorted_primer_dict, unblast_file, FRprimer)

        # 3. exact-match BLAST against the reference genome
        if not os.path.exists(genome_fasta):
            sys.stderr.write('\r[*] Please give the hg19.fasta file[hg19.fasta]\n')
            sys.exit(-1)
        os.chdir(refDB_path)
        _ensure_blast_db()
        exact_match_output = _run_exact_blast(UOD_all_fasta)

        # Drop every primer that also matches somewhere else in the genome.
        for primer_exact_match in _find_off_target_primers(exact_match_output):
            primer_dict_all.pop(primer_exact_match, None)
        no_primers_no_exact_match = len(primer_dict_all)

        # 4. final outputs
        _write_final_primers(primer_dict_all, FPrimer, RPrimer,
                             final_UOD_primer, final_UOD_all_primer_info,
                             final_UOD_all_primer_fasta)

        ############################################################
        #Time to run the code: end timer
        ############################################################
        # NOTE(review): t0 is not defined in this function; presumably the
        # start timestamp is set at module level elsewhere -- TODO confirm.
        t1 = time.time()
        total = "{0:.2f}".format(round(t1 - t0, 2))
        parameters_used.write(
            "no. of primers designed based on filter criteria : " +
            str(no_primers_designed) + '\n'
            "no. of primers without exact match : " +
            str(no_primers_no_exact_match) + '\n'
            "### UOD run duration : " + str(total) + " seconds" + '\n'
            "##########################################################" +
            "\n" + "\n" + "\n")