def variant_mask(chr_no, start_pos, stop_pos, variant_path, refDB_path,
                 write_path, time_stamp, variant_mask_condition):
    """Write the locus as a FASTA file with variant positions masked.

    The locus sequence comes from ``get_Locus``. When
    ``variant_mask_condition`` is 1, the tab-separated table at
    ``variant_path + "chr" + str(chr_no)`` is read and every row whose
    ``coordinate`` falls in ``[start_pos, stop_pos]`` is masked in the
    sequence: rows whose ``alternate_allele`` contains ``<INS>`` or
    ``<DEL>`` become "n", every other row becomes "N".

    The output file is named
    ``<write_path><time_stamp>_<chr_no>_<start_pos>_<length>_VariantMasked.fasta``
    where ``length`` is ``stop_pos - start_pos + 1``.

    Returns:
        ``[indel_count, snp_count, gap_count, locus_gc]``. ``gap_count`` is
        the number of non-overlapping runs of 100 "N" in the unmasked locus
        and ``locus_gc`` is ``gc_content`` of the unmasked locus.
    """
    indel_count = 0
    snp_count = 0
    locus_len = (stop_pos - start_pos) + 1
    label = str(chr_no) + "_" + str(start_pos) + "_" + str(locus_len)
    out_name = (write_path + str(time_stamp) + "_" + label +
                "_VariantMasked.fasta")

    # The context manager guarantees the FASTA handle is closed even when the
    # locus lookup or the variant table read fails.
    with open(out_name, "w") as output_poly_masked:
        output_poly_masked.write(
            ">" + "TA_" + label + "_" + "VariantMasked" + '\n')

        Locus = get_Locus(start_pos, stop_pos, chr_no, refDB_path)
        # Both statistics describe the locus before any masking is applied.
        gap_count = Locus.count('N' * 100)
        locus_gc = gc_content(Locus)

        if variant_mask_condition == 1:
            df = pd.read_csv(variant_path + "chr" + str(chr_no),
                             sep='\t', header=0)
            df = df[(df.coordinate >= int(start_pos)) &
                    (df.coordinate <= int(stop_pos))]

            # Mutate a list of characters once instead of rebuilding the
            # whole sequence string for every variant.
            bases = list(Locus)
            for coordinate, alt_allele in zip(df['coordinate'],
                                              df['alternate_allele']):
                if '<INS>' in alt_allele or '<DEL>' in alt_allele:
                    mark = "n"  # Indels annotated as "n"
                    indel_count += 1
                else:
                    mark = "N"  # Other polymorphisms annotated as "N"
                    snp_count += 1
                # Zero-based offset of the variant inside the locus.
                offset = int(coordinate) - (start_pos - 1) - 1
                if offset < len(bases):
                    bases[offset] = mark
                else:
                    # Slice-based masking appended the mark when the
                    # coordinate lay past the end of the sequence; keep that.
                    bases.append(mark)
            Locus = "".join(bases)

        output_poly_masked.write(str(Locus))

    return [indel_count, snp_count, gap_count, locus_gc]
def primer_filter(primer, minTm, maxTm, GC_range_min, GC_range_max,
                  self_Tmdiff, filter_AT_3prime, primer_dict, i, strand,
                  filter_di_si_repeats, filter_GC_clamp):
    """Record ``primer`` in ``primer_dict`` if it passes every filter.

    The checks run in this order and the function returns as soon as one
    fails: composition, GC content within ``[GC_range_min, GC_range_max]``,
    AT-end check, GC-clamp check, di/single nucleotide repeat check, melting
    temperature within ``[minTm, maxTm]``, and finally the margin between the
    primer Tm and both its hairpin Tm and homodimer Tm (each must be at least
    ``self_Tmdiff``).

    A primer seen for the first time is stored as
    ``[i, primer_size, 1, primer_Tm, strand]``; on later sightings only the
    third element (the count) is incremented. Nothing is returned.

    Reaction conditions (``primer_conc``, ``Na``, ``K``, ``Tris``, ``Mg``,
    ``dNTPs``, ``monovalent_cation_eq``), ``di_si_repeats_threshold`` and
    ``primer_size`` are read from module scope.
    """
    if not check_compostion(primer):
        return

    gc_percent = gc_content(primer)
    if not (gc_percent >= GC_range_min and gc_percent <= GC_range_max):
        return

    if check_ATends(primer, filter_AT_3prime) != "noATend_filter":
        return

    if check_GC_clamp(primer, filter_GC_clamp) != "noGC_clamp_filter":
        return

    repeat_flag = di_single_nucleo_repeat_filter(
        primer,
        filter_di_si_repeats=filter_di_si_repeats,
        di_si_repeats_threshold=di_si_repeats_threshold)
    if repeat_flag != 0:
        return

    melting_temp = float(
        NN_Tm(seq=primer, compl_seq=complement(primer),
              primer_conc=primer_conc, Na=Na, K=K, Tris=Tris, Mg=Mg,
              dNTPs=dNTPs, ion_corr=True))
    if not (melting_temp >= minTm and melting_temp <= maxTm):
        return

    # Self-structure check: the primer must melt well above its own hairpin
    # and homodimer.
    hairpin_temp = float(
        hairpin_Tm(primer, monovalent_cation_eq, primer_conc))
    homodimer_temp = float(
        homodimer_Tm(primer, monovalent_cation_eq, primer_conc))
    required_margin = float(self_Tmdiff)
    if not ((melting_temp - hairpin_temp) >= required_margin and
            (melting_temp - homodimer_temp) >= required_margin):
        return

    if primer in primer_dict:
        primer_dict[primer][2] += 1
    else:
        primer_dict[primer] = [i, (primer_size), 1, melting_temp, strand]
def primer_filter(primer, minTm, maxTm, GC_range_min, GC_range_max,
                  self_Tmdiff, filter_AT_3prime, primer_dict, i, strand,
                  filter_di_si_repeats, filter_GC_clamp):
    """Add ``primer`` to ``primer_dict`` when it survives all filters.

    Filters are applied one after another, stopping at the first failure:
    composition, GC content range, AT-end check, GC-clamp check, di/single
    nucleotide repeat check, Tm range ``[minTm, maxTm]``, then the self
    structure check (primer Tm minus hairpin Tm and primer Tm minus
    homodimer Tm must both be at least ``self_Tmdiff``).

    New primers are stored as ``[i, primer_size, 1, primer_Tm, strand]``;
    for a primer already present only the count at index 2 is incremented.
    The function returns nothing.

    ``primer_conc``, ``Na``, ``K``, ``Tris``, ``Mg``, ``dNTPs``,
    ``monovalent_cation_eq``, ``di_si_repeats_threshold`` and ``primer_size``
    come from module scope.
    """
    if not check_compostion(primer):
        return

    gc_value = gc_content(primer)
    gc_in_range = gc_value >= GC_range_min and gc_value <= GC_range_max
    if not gc_in_range:
        return

    at_end_verdict = check_ATends(primer, filter_AT_3prime)
    if at_end_verdict != "noATend_filter":
        return

    gc_clamp_verdict = check_GC_clamp(primer, filter_GC_clamp)
    if gc_clamp_verdict != "noGC_clamp_filter":
        return

    repeat_verdict = di_single_nucleo_repeat_filter(
        primer,
        filter_di_si_repeats=filter_di_si_repeats,
        di_si_repeats_threshold=di_si_repeats_threshold)
    if repeat_verdict != 0:
        return

    tm_value = float(
        NN_Tm(seq=primer, compl_seq=complement(primer),
              primer_conc=primer_conc, Na=Na, K=K, Tris=Tris, Mg=Mg,
              dNTPs=dNTPs, ion_corr=True))
    tm_in_range = tm_value >= minTm and tm_value <= maxTm
    if not tm_in_range:
        return

    # Both self-structure melting temperatures are computed before the
    # margin test, mirroring the order of the external calls.
    tm_hairpin = float(hairpin_Tm(primer, monovalent_cation_eq, primer_conc))
    tm_homodimer = float(
        homodimer_Tm(primer, monovalent_cation_eq, primer_conc))
    margin = float(self_Tmdiff)
    clears_self_structures = ((tm_value - tm_hairpin) >= margin and
                              (tm_value - tm_homodimer) >= margin)
    if not clears_self_structures:
        return

    if primer in primer_dict:
        primer_dict[primer][2] += 1
    else:
        primer_dict[primer] = [i, (primer_size), 1, tm_value, strand]
def variant_mask(chr_no, start_pos, stop_pos, variant_path, refDB_path,
                 write_path, time_stamp, variant_mask_condition):
    """Write the locus as a FASTA file with variant positions masked.

    The locus sequence comes from ``get_Locus``. When
    ``variant_mask_condition`` is 1, the tab-separated table at
    ``variant_path + "chr" + str(chr_no)`` is read and every row whose
    ``coordinate`` falls in ``[start_pos, stop_pos]`` is masked in the
    sequence: rows whose ``alternate_allele`` contains ``<INS>`` or
    ``<DEL>`` become "n", every other row becomes "N".

    The output file is named
    ``<write_path><time_stamp>_<chr_no>_<start_pos>_<length>_VariantMasked.fasta``
    where ``length`` is ``stop_pos - start_pos + 1``.

    Returns:
        ``[indel_count, snp_count, gap_count, locus_gc]``. ``gap_count`` is
        the number of non-overlapping runs of 100 "N" in the unmasked locus
        and ``locus_gc`` is ``gc_content`` of the unmasked locus.
    """
    indel_count = 0
    snp_count = 0
    locus_len = (stop_pos - start_pos) + 1
    label = str(chr_no) + "_" + str(start_pos) + "_" + str(locus_len)
    out_name = (write_path + str(time_stamp) + "_" + label +
                "_VariantMasked.fasta")

    # The context manager guarantees the FASTA handle is closed even when the
    # locus lookup or the variant table read fails.
    with open(out_name, "w") as output_poly_masked:
        output_poly_masked.write(
            ">" + "TA_" + label + "_" + "VariantMasked" + '\n')

        Locus = get_Locus(start_pos, stop_pos, chr_no, refDB_path)
        # Both statistics describe the locus before any masking is applied.
        gap_count = Locus.count('N' * 100)
        locus_gc = gc_content(Locus)

        if variant_mask_condition == 1:
            df = pd.read_csv(variant_path + "chr" + str(chr_no),
                             sep='\t', header=0)
            df = df[(df.coordinate >= int(start_pos)) &
                    (df.coordinate <= int(stop_pos))]

            # Mutate a list of characters once instead of rebuilding the
            # whole sequence string for every variant.
            bases = list(Locus)
            for coordinate, alt_allele in zip(df['coordinate'],
                                              df['alternate_allele']):
                if '<INS>' in alt_allele or '<DEL>' in alt_allele:
                    mark = "n"  # Indels annotated as "n"
                    indel_count += 1
                else:
                    mark = "N"  # Other polymorphisms annotated as "N"
                    snp_count += 1
                # Zero-based offset of the variant inside the locus.
                offset = int(coordinate) - (start_pos - 1) - 1
                if offset < len(bases):
                    bases[offset] = mark
                else:
                    # Slice-based masking appended the mark when the
                    # coordinate lay past the end of the sequence; keep that.
                    bases.append(mark)
            Locus = "".join(bases)

        output_poly_masked.write(str(Locus))

    return [indel_count, snp_count, gap_count, locus_gc]
# Sink for the command-line tools' stdout/stderr.
f0 = open(os.devnull, 'w')

# Build a nucleotide BLAST database from the locus sequence.
makeblastdb_cmd = [
    "makeblastdb",
    "-in", "%s" % locus,
    "-dbtype", "nucl",
    "-parse_seqids",
    "-out", "%sloci_db" % query_path,
]
sp.call(makeblastdb_cmd, stdout=f0, stderr=f0)

### Exact match to loci to get coords
blastn_cmd = [
    "blastn",
    "-db", "%sloci_db" % query_path,
    "-query", "%s" % fasta_input_file_FR,
    "-evalue", "0.1",
    "-word_size", "%s" % word_size,
    "-gapopen", "0",
    "-gapextend", "2",
    "-reward", "1",
    "-penalty", "-3",
    "-dust", "no",
    "-perc_identity", "100",
    "-max_target_seqs", "13",
    "-outfmt", "10 qseqid length sstart send",
    "-num_threads", "%s" % mp_num_threads,
]
p5 = sp.Popen(blastn_cmd, stdout=sp.PIPE, stderr=f0)
exact_match_output_pooled_primers, error = p5.communicate()

### final pooled oligo coordinate output file
f5 = open(query_path + path + Time_stamp + "_" + 'PSE_out3_1.csv', 'w')
### changed header name to Max_misprime_Tm
pooled_header_columns = [
    "Primer",
    "Loci_start",
    "Loci_stop",
    "Genome_start",
    "Genome_stop",
    "Strand",
    "Primer_Tm",
    "Max_misprime_Tm",
    "Tm_difference",
    'Misprime_Tm_' + str(misprime_Tm_percentile_value) + 'th_percentile',
    "Primer_GC",
    "Continuous_GC",
    "3'_region_mismatches",
    "Hairpin_Tm",
    "Homodimer_Tm",
]
f5.write(','.join(pooled_header_columns) + '\n')

# Each BLAST line is "qseqid,length,sstart,send"; the text after the final
# newline is dropped by the [:-1] slice.
for exact_match_pooled_output_line in exact_match_output_pooled_primers.split('\n')[:-1]:
    exact_match_pooled_output_line = exact_match_pooled_output_line.strip(' ').split(',')

    Primer_pooled = exact_match_pooled_output_line[0]
    primer_gc = gc_content(Primer_pooled)
    primer_continuous_gc = continuous_gc(Primer_pooled)
    query_len_pooled = len(Primer_pooled)
    match_len_pooled = int(exact_match_pooled_output_line[1])

    # Shift the hit coordinates by the locus start position.
    sstart = (actual_locus_start_pos + int(exact_match_pooled_output_line[2])) - 1
    send = (actual_locus_start_pos + int(exact_match_pooled_output_line[3])) - 1

    # Only hits that cover the full primer length are used.
    if query_len_pooled == match_len_pooled:
        pooled_primer_record = pooled_primer_f_r_dict[Primer_pooled]
        Primer_Tm_p = pooled_primer_record[0]
        Max_misprime_Tm_p = pooled_primer_record[1]
        Tm_difference_p = pooled_primer_record[2]
        three_prime_region_mismatches = pooled_primer_record[3]
        misprime_Tm_percentile = pooled_primer_record[4]
        Hairpin_Tm = pooled_primer_record[5]
        Homodimer_Tm = pooled_primer_record[6]
def pick_primer_pairs(input_file):
    """Pair forward and reverse primers from a primer CSV and write them out.

    Rows of ``input_file`` (columns ``Primer``, ``Genome_start``,
    ``Genome_stop``, ``Strand``, ``Primer_Tm``, ``Max_misprime_Tm``,
    ``Primer_GC``) are loaded as graph nodes when
    ``Primer_Tm - Max_misprime_Tm >= pair_misprimeTm_diff``. Every "+" primer
    is then tried against every "-" primer; a pair is kept when it passes the
    amplicon size, pair mispriming margin, pair Tm difference, heterodimer dG
    and (optionally) amplicon gap checks.

    Side effects: writes a BED file of newly seen primers, appends two rows
    per pair to the module-level ``output_primer_pairs`` handle and two rows
    per pair to the module-level ``order_primer_pairs`` handle, and closes
    both of those handles before returning. All thresholds and run settings
    are read from module scope.

    Fixes relative to the previous implementation:
      * the input CSV handle is closed;
      * an amplicon rejected by the gap filter only skips that pair
        (``continue``) instead of abandoning all remaining reverse primers
        for the current forward primer (``break``);
      * the reverse primer's BED line is written whenever the reverse primer
        is new, not only when the forward primer was already known.

    Returns:
        dict with keys ``mplex_ampl_coords_list``, ``Gfor_subsetting``,
        ``amplicon_len_list``, ``amplicon_coords_list``,
        ``primer_pair_coords`` and ``no_unique_primers_picked``.
    """
    G = nx.Graph()
    with open(input_file) as f:
        for row in csv.DictReader(f, delimiter=','):
            primer = row['Primer']
            p_start_pos = int(row['Genome_start'])
            p_stop_pos = int(row['Genome_stop'])
            strand = row['Strand']
            Tm = float(row['Primer_Tm'])
            max_misprimeTm = row['Max_misprime_Tm']
            GC = float(row['Primer_GC'])

            # The sentinel "3prime_mismatch" is treated as a mispriming Tm
            # of 0.
            if max_misprimeTm == "3prime_mismatch":
                max_misprimeTm = 0
            else:
                max_misprimeTm = float(max_misprimeTm)

            if Tm - max_misprimeTm >= pair_misprimeTm_diff:
                G.add_node(primer, p_start_pos=p_start_pos,
                           p_stop_pos=p_stop_pos, strand=strand, Tm=Tm,
                           max_misprimeTm=max_misprimeTm, GC=GC)

    seen_primers = set()
    primer_pair_coords = []
    amplicon_len_list = []
    amplicon_coords_list = []
    mplex_ampl_coords_list = []
    Gfor_subsetting = nx.Graph()
    primer_pair_counter = 0

    # Split once by strand (node order preserved) so the pairing loop does
    # not rescan and re-test every node for every forward primer.
    all_nodes = list(G.nodes(data=True))
    forward_primers = [(seq, attrs) for seq, attrs in all_nodes
                       if attrs['strand'] == "+"]
    reverse_primers = [(seq, attrs) for seq, attrs in all_nodes
                       if attrs['strand'] == "-"]

    ### pick primer pairs that meet the user defined conditions
    ### write bed files (for picked oligos)
    bed_path = (path + Time_stamp + "_" +
                'bed_separate_tracks_selected_oligos.bed')
    with open(bed_path, 'w') as f8:
        f8.write('browser position chr ' + str(chr_no) + ':' +
                 str(start_pos) + '-' + str(stop_pos) + '\n')
        f8.write('track name="Primers" description="Primers on separate tracks" visibility=2 colorByStrand="255,0,0 0,0,255"' + '\n')

        for query_nodeId, data_f in forward_primers:
            forward_start = data_f['p_start_pos']
            # Pair-level comparisons use integer-truncated temperatures.
            forward_Tm = int(data_f['Tm'])
            forward_MaxMispTm = int(data_f['max_misprimeTm'])

            for nodeId, data_r in reverse_primers:
                amplicon_len = (data_r['p_stop_pos'] - forward_start) + 1
                reverse_Tm = int(data_r['Tm'])
                reverse_MaxMispTm = int(data_r['max_misprimeTm'])

                ### amplicon length condition
                if not (amplicon_len >= amplicon_size_min and
                        amplicon_len <= amplicon_size_max):
                    continue
                ### pair_misprimeTm_diff condition
                if not (min(forward_Tm, reverse_Tm) -
                        max(forward_MaxMispTm, reverse_MaxMispTm)
                        >= pair_misprimeTm_diff):
                    continue
                ### pair_Tm_diff condition
                if not (abs(forward_Tm - reverse_Tm) <= pair_Tm_diff):
                    continue
                ### between-primer interaction dG condition
                interaction_dg = heterodimer_dg(
                    query_nodeId, nodeId,
                    mv_cation=monovalent_cation_eq,
                    primer_conc=primer_conc)
                if not (interaction_dg >= min_misprime_dg):
                    continue

                # NOTE(review): the length is recorded before the gap filter
                # below, so gap-rejected amplicons are still counted here.
                # Kept as-is; confirm whether that is intended.
                amplicon_len_list.append(amplicon_len)

                ### get the expected amplicon sequence
                amplicon_seq = spf.seq_extraction_loci(
                    locus, start_pos,
                    data_f['p_start_pos'], data_r['p_stop_pos'])

                ### eliminate amplicons with gaps
                has_gap = 'N' * 100 in amplicon_seq
                if amplicon_gap_filter == 1 and has_gap:
                    # Skip only this pair; other reverse primers may still
                    # give a gap-free amplicon.
                    continue

                amplicon_gc = spf.gc_content(amplicon_seq.upper())
                gaps = 'Yes' if has_gap else 'No'
                indel = 'Yes' if 'n' in amplicon_seq else 'No'

                primer_name_f = ("TA_" + str(chr_no) + "_" +
                                 str(data_f['p_start_pos']) + "_" +
                                 str(len(query_nodeId)) + "_F")
                ### naming of r primers uses 5' end coords
                primer_name_r = ("TA_" + str(chr_no) + "_" +
                                 str(data_r['p_stop_pos']) + "_" +
                                 str(len(nodeId)) + "_R")

                amplicon_coords = (data_f['p_start_pos'],
                                   data_r['p_stop_pos'])
                primer_pair_coords.append(amplicon_coords)

                # One BED line per primer the first time it is picked. The
                # two tests are independent so a new reverse primer is not
                # lost when its forward partner is also new.
                if query_nodeId not in seen_primers:
                    f8.write('\t'.join([
                        'chr' + str(chr_no),
                        str(data_f['p_start_pos']),
                        str(data_f['p_stop_pos']),
                        str(primer_name_f), str(0), "+"]) + '\n')
                if nodeId not in seen_primers:
                    f8.write('\t'.join([
                        'chr' + str(chr_no),
                        str(data_r['p_start_pos']),
                        str(data_r['p_stop_pos']),
                        str(primer_name_r), str(0), "-"]) + '\n')
                seen_primers.add(query_nodeId)
                seen_primers.add(nodeId)

                # nodes for creation of new network for subsetting
                # connected components
                amplicon_coords_list.append(amplicon_coords)

                ### info for multiplex primer picking: both primers must lie
                ### in the multiplex Tm range when the option is enabled
                # NOTE(review): nesting reconstructed from whitespace-mangled
                # source; the multiplex bookkeeping is taken to be inside
                # this condition and the output rows outside it. Confirm.
                if (multiplex_primers == 1 and
                        data_f['Tm'] >= multiplex_Tm_min and
                        data_r['Tm'] >= multiplex_Tm_min and
                        data_f['Tm'] <= multiplex_Tm_max and
                        data_r['Tm'] <= multiplex_Tm_max):
                    f_primer_info = {
                        'primer_name': primer_name_f,
                        'primer_sequence': query_nodeId,
                        'strand': data_f['strand'],
                        'p_start_pos': data_f['p_start_pos'],
                        'p_stop_pos': data_f['p_stop_pos'],
                        'Tm': data_f['Tm'],
                        'max_misprimeTm': data_f['max_misprimeTm'],
                        'GC': data_f['GC'],
                    }
                    r_primer_info = {
                        'primer_name': primer_name_r,
                        'primer_sequence': nodeId,
                        'strand': data_r['strand'],
                        'p_start_pos': data_r['p_start_pos'],
                        'p_stop_pos': data_r['p_stop_pos'],
                        'Tm': data_r['Tm'],
                        'max_misprimeTm': data_r['max_misprimeTm'],
                        'GC': data_r['GC'],
                    }
                    amplicon_info = {
                        'interaction_dg': interaction_dg,
                        'amplicon_len': amplicon_len,
                        'amplicon_gc': amplicon_gc,
                        'gaps': gaps,
                        'indel': indel,
                        'amplicon_seq': amplicon_seq,
                    }
                    Gfor_subsetting.add_node(amplicon_coords,
                                             f_primer_info=f_primer_info,
                                             r_primer_info=r_primer_info,
                                             amplicon_info=amplicon_info)
                    mplex_ampl_coords_list.append(amplicon_coords)

                primer_pair_counter += 1
                forward_row = [
                    primer_pair_counter, primer_name_f, query_nodeId,
                    data_f['strand'], data_f['p_start_pos'],
                    data_f['p_stop_pos'], data_f['Tm'],
                    data_f['max_misprimeTm'], data_f['GC'], interaction_dg,
                    amplicon_len, amplicon_gc, gaps, indel, amplicon_seq]
                # The reverse row lists its 5' position first and carries
                # placeholders for the amplicon-level columns.
                reverse_row = [
                    primer_pair_counter, primer_name_r, nodeId,
                    data_r['strand'], data_r['p_stop_pos'],
                    data_r['p_start_pos'], data_r['Tm'],
                    data_r['max_misprimeTm'], data_r['GC'],
                    '-', '-', '-', '-', '-', '-']
                output_primer_pairs.write(
                    '\t'.join(map(str, forward_row)) + '\n')
                output_primer_pairs.write(
                    '\t'.join(map(str, reverse_row)) + '\n')
                order_primer_pairs.write(
                    '\t'.join(map(str, [primer_name_f, query_nodeId,
                                        data_f['Tm']])) + '\n')
                order_primer_pairs.write(
                    '\t'.join(map(str, [primer_name_r, nodeId,
                                        data_r['Tm']])) + '\n')

    output_primer_pairs.close()
    order_primer_pairs.close()

    return {'mplex_ampl_coords_list': mplex_ampl_coords_list,
            'Gfor_subsetting': Gfor_subsetting,
            'amplicon_len_list': amplicon_len_list,
            'amplicon_coords_list': amplicon_coords_list,
            'primer_pair_coords': primer_pair_coords,
            'no_unique_primers_picked': len(seen_primers)}
def pick_primer_pairs(input_file):
    """Pair forward and reverse primers from a primer CSV and write them out.

    Rows of ``input_file`` (columns ``Primer``, ``Genome_start``,
    ``Genome_stop``, ``Strand``, ``Primer_Tm``, ``Max_misprime_Tm``,
    ``Primer_GC``) are loaded as graph nodes when
    ``Primer_Tm - Max_misprime_Tm >= pair_misprimeTm_diff``. Every "+" primer
    is then tried against every "-" primer; a pair is kept when it passes the
    amplicon size, pair mispriming margin, pair Tm difference, heterodimer dG
    and (optionally) amplicon gap checks.

    Side effects: writes a BED file of newly seen primers, appends two rows
    per pair to the module-level ``output_primer_pairs`` handle and two rows
    per pair to the module-level ``order_primer_pairs`` handle, and closes
    both of those handles before returning. All thresholds and run settings
    are read from module scope.

    Fixes relative to the previous implementation:
      * the input CSV handle is closed;
      * an amplicon rejected by the gap filter only skips that pair
        (``continue``) instead of abandoning all remaining reverse primers
        for the current forward primer (``break``);
      * the reverse primer's BED line is written whenever the reverse primer
        is new, not only when the forward primer was already known.

    Returns:
        dict with keys ``mplex_ampl_coords_list``, ``Gfor_subsetting``,
        ``amplicon_len_list``, ``amplicon_coords_list``,
        ``primer_pair_coords`` and ``no_unique_primers_picked``.
    """
    G = nx.Graph()
    with open(input_file) as f:
        for row in csv.DictReader(f, delimiter=','):
            primer = row['Primer']
            p_start_pos = int(row['Genome_start'])
            p_stop_pos = int(row['Genome_stop'])
            strand = row['Strand']
            Tm = float(row['Primer_Tm'])
            max_misprimeTm = row['Max_misprime_Tm']
            GC = float(row['Primer_GC'])

            # The sentinel "3prime_mismatch" is treated as a mispriming Tm
            # of 0.
            if max_misprimeTm == "3prime_mismatch":
                max_misprimeTm = 0
            else:
                max_misprimeTm = float(max_misprimeTm)

            if Tm - max_misprimeTm >= pair_misprimeTm_diff:
                G.add_node(primer, p_start_pos=p_start_pos,
                           p_stop_pos=p_stop_pos, strand=strand, Tm=Tm,
                           max_misprimeTm=max_misprimeTm, GC=GC)

    seen_primers = set()
    primer_pair_coords = []
    amplicon_len_list = []
    amplicon_coords_list = []
    mplex_ampl_coords_list = []
    Gfor_subsetting = nx.Graph()
    primer_pair_counter = 0

    # Split once by strand (node order preserved) so the pairing loop does
    # not rescan and re-test every node for every forward primer.
    all_nodes = list(G.nodes(data=True))
    forward_primers = [(seq, attrs) for seq, attrs in all_nodes
                       if attrs['strand'] == "+"]
    reverse_primers = [(seq, attrs) for seq, attrs in all_nodes
                       if attrs['strand'] == "-"]

    ### pick primer pairs that meet the user defined conditions
    ### write bed files (for picked oligos)
    bed_path = (path + Time_stamp + "_" +
                'bed_separate_tracks_selected_oligos.bed')
    with open(bed_path, 'w') as f8:
        f8.write('browser position chr ' + str(chr_no) + ':' +
                 str(start_pos) + '-' + str(stop_pos) + '\n')
        f8.write('track name="Primers" description="Primers on separate tracks" visibility=2 colorByStrand="255,0,0 0,0,255"' + '\n')

        for query_nodeId, data_f in forward_primers:
            forward_start = data_f['p_start_pos']
            # Pair-level comparisons use integer-truncated temperatures.
            forward_Tm = int(data_f['Tm'])
            forward_MaxMispTm = int(data_f['max_misprimeTm'])

            for nodeId, data_r in reverse_primers:
                amplicon_len = (data_r['p_stop_pos'] - forward_start) + 1
                reverse_Tm = int(data_r['Tm'])
                reverse_MaxMispTm = int(data_r['max_misprimeTm'])

                ### amplicon length condition
                if not (amplicon_len >= amplicon_size_min and
                        amplicon_len <= amplicon_size_max):
                    continue
                ### pair_misprimeTm_diff condition
                if not (min(forward_Tm, reverse_Tm) -
                        max(forward_MaxMispTm, reverse_MaxMispTm)
                        >= pair_misprimeTm_diff):
                    continue
                ### pair_Tm_diff condition
                if not (abs(forward_Tm - reverse_Tm) <= pair_Tm_diff):
                    continue
                ### between-primer interaction dG condition
                interaction_dg = heterodimer_dg(
                    query_nodeId, nodeId,
                    mv_cation=monovalent_cation_eq,
                    primer_conc=primer_conc)
                if not (interaction_dg >= min_misprime_dg):
                    continue

                # NOTE(review): the length is recorded before the gap filter
                # below, so gap-rejected amplicons are still counted here.
                # Kept as-is; confirm whether that is intended.
                amplicon_len_list.append(amplicon_len)

                ### get the expected amplicon sequence
                amplicon_seq = spf.seq_extraction_loci(
                    locus, start_pos,
                    data_f['p_start_pos'], data_r['p_stop_pos'])

                ### eliminate amplicons with gaps
                has_gap = 'N' * 100 in amplicon_seq
                if amplicon_gap_filter == 1 and has_gap:
                    # Skip only this pair; other reverse primers may still
                    # give a gap-free amplicon.
                    continue

                amplicon_gc = spf.gc_content(amplicon_seq.upper())
                gaps = 'Yes' if has_gap else 'No'
                indel = 'Yes' if 'n' in amplicon_seq else 'No'

                primer_name_f = ("TA_" + str(chr_no) + "_" +
                                 str(data_f['p_start_pos']) + "_" +
                                 str(len(query_nodeId)) + "_F")
                ### naming of r primers uses 5' end coords
                primer_name_r = ("TA_" + str(chr_no) + "_" +
                                 str(data_r['p_stop_pos']) + "_" +
                                 str(len(nodeId)) + "_R")

                amplicon_coords = (data_f['p_start_pos'],
                                   data_r['p_stop_pos'])
                primer_pair_coords.append(amplicon_coords)

                # One BED line per primer the first time it is picked. The
                # two tests are independent so a new reverse primer is not
                # lost when its forward partner is also new.
                if query_nodeId not in seen_primers:
                    f8.write('\t'.join([
                        'chr' + str(chr_no),
                        str(data_f['p_start_pos']),
                        str(data_f['p_stop_pos']),
                        str(primer_name_f), str(0), "+"]) + '\n')
                if nodeId not in seen_primers:
                    f8.write('\t'.join([
                        'chr' + str(chr_no),
                        str(data_r['p_start_pos']),
                        str(data_r['p_stop_pos']),
                        str(primer_name_r), str(0), "-"]) + '\n')
                seen_primers.add(query_nodeId)
                seen_primers.add(nodeId)

                # nodes for creation of new network for subsetting
                # connected components
                amplicon_coords_list.append(amplicon_coords)

                ### info for multiplex primer picking: both primers must lie
                ### in the multiplex Tm range when the option is enabled
                # NOTE(review): nesting reconstructed from whitespace-mangled
                # source; the multiplex bookkeeping is taken to be inside
                # this condition and the output rows outside it. Confirm.
                if (multiplex_primers == 1 and
                        data_f['Tm'] >= multiplex_Tm_min and
                        data_r['Tm'] >= multiplex_Tm_min and
                        data_f['Tm'] <= multiplex_Tm_max and
                        data_r['Tm'] <= multiplex_Tm_max):
                    f_primer_info = {
                        'primer_name': primer_name_f,
                        'primer_sequence': query_nodeId,
                        'strand': data_f['strand'],
                        'p_start_pos': data_f['p_start_pos'],
                        'p_stop_pos': data_f['p_stop_pos'],
                        'Tm': data_f['Tm'],
                        'max_misprimeTm': data_f['max_misprimeTm'],
                        'GC': data_f['GC'],
                    }
                    r_primer_info = {
                        'primer_name': primer_name_r,
                        'primer_sequence': nodeId,
                        'strand': data_r['strand'],
                        'p_start_pos': data_r['p_start_pos'],
                        'p_stop_pos': data_r['p_stop_pos'],
                        'Tm': data_r['Tm'],
                        'max_misprimeTm': data_r['max_misprimeTm'],
                        'GC': data_r['GC'],
                    }
                    amplicon_info = {
                        'interaction_dg': interaction_dg,
                        'amplicon_len': amplicon_len,
                        'amplicon_gc': amplicon_gc,
                        'gaps': gaps,
                        'indel': indel,
                        'amplicon_seq': amplicon_seq,
                    }
                    Gfor_subsetting.add_node(amplicon_coords,
                                             f_primer_info=f_primer_info,
                                             r_primer_info=r_primer_info,
                                             amplicon_info=amplicon_info)
                    mplex_ampl_coords_list.append(amplicon_coords)

                primer_pair_counter += 1
                forward_row = [
                    primer_pair_counter, primer_name_f, query_nodeId,
                    data_f['strand'], data_f['p_start_pos'],
                    data_f['p_stop_pos'], data_f['Tm'],
                    data_f['max_misprimeTm'], data_f['GC'], interaction_dg,
                    amplicon_len, amplicon_gc, gaps, indel, amplicon_seq]
                # The reverse row lists its 5' position first and carries
                # placeholders for the amplicon-level columns.
                reverse_row = [
                    primer_pair_counter, primer_name_r, nodeId,
                    data_r['strand'], data_r['p_stop_pos'],
                    data_r['p_start_pos'], data_r['Tm'],
                    data_r['max_misprimeTm'], data_r['GC'],
                    '-', '-', '-', '-', '-', '-']
                output_primer_pairs.write(
                    '\t'.join(map(str, forward_row)) + '\n')
                output_primer_pairs.write(
                    '\t'.join(map(str, reverse_row)) + '\n')
                order_primer_pairs.write(
                    '\t'.join(map(str, [primer_name_f, query_nodeId,
                                        data_f['Tm']])) + '\n')
                order_primer_pairs.write(
                    '\t'.join(map(str, [primer_name_r, nodeId,
                                        data_r['Tm']])) + '\n')

    output_primer_pairs.close()
    order_primer_pairs.close()

    return {'mplex_ampl_coords_list': mplex_ampl_coords_list,
            'Gfor_subsetting': Gfor_subsetting,
            'amplicon_len_list': amplicon_len_list,
            'amplicon_coords_list': amplicon_coords_list,
            'primer_pair_coords': primer_pair_coords,
            'no_unique_primers_picked': len(seen_primers)}
def PickPrimerPair(inputfile):
    """Select single primers and the fixed-length window next to each one.

    Rows of ``inputfile`` (columns ``id``, ``Strand``, ``Primer_Tm``,
    ``Max_misprime_Tm``, ``Primer_GC``, ``Continuous_GC``) become graph nodes
    when ``Primer_Tm - Max_misprime_Tm >= pair_misprimeTm_diff``. Chromosome,
    start position and length are parsed from the underscore-separated
    ``id`` (fields 2, 3 and 4), e.g.
    ``TA_1_chr17_7565168_23_65.84_+_TCCCTGGTTAAGAGATCCTCCTG``.

    For every node a 120-position window is derived: for "+" primers it runs
    from the primer stop to primer start + 120; for "-" primers from primer
    start - 120 to primer start. The window sequence is fetched with
    ``seq_extraction_loci``; empty sequences, and (when
    ``amplicon_gap_filter == 1``) sequences containing a run of 100 "N", are
    skipped.

    Side effects: writes ``PPS_primer_pairs_info.txt`` (header plus one row
    per accepted primer), ``_primer_pairs_order.txt`` (header only) and
    ``bed_separate_tracks_selected_oligos.bed`` (track line only) under
    ``outdir``. Thresholds and ``outdir`` are read from module scope.

    The input CSV is now read and closed before the output files are
    created, and all output files are closed even if processing fails.

    Returns:
        dict with keys ``mplex_ampl_coords_list``, ``Gfor_subsetting``,
        ``amplicon_coords_list`` and ``primer_pair_counter``.
    """
    amplicon_span = 120  # window length applied on both strands

    G = nx.Graph()
    with open(inputfile) as f:
        for row in csv.DictReader(f, delimiter=','):
            id_p = row['id']
            strand = row['Strand']
            Tm = float(row['Primer_Tm'])
            max_misprimeTm = row['Max_misprime_Tm']
            GC = float(row['Primer_GC'])
            Continuous_GC = int(row['Continuous_GC'])

            # The sentinel "3prime_mismatch" is treated as a mispriming Tm
            # of 0.
            if max_misprimeTm == "3prime_mismatch":
                max_misprimeTm = 0
            else:
                max_misprimeTm = float(max_misprimeTm)

            if Tm - max_misprimeTm >= pair_misprimeTm_diff:
                id_fields = id_p.split('_')
                site_start = int(id_fields[3])
                G.add_node(id_p,
                           p_chrom=id_fields[2],
                           p_start_pos=site_start,
                           p_stop_pos=site_start + int(id_fields[4]),
                           strand=strand, Tm=Tm,
                           max_misprimeTm=max_misprimeTm, GC=GC,
                           Continuous_GC=Continuous_GC)

    locus = outdir + "sequence.txt"
    primer_pair_counter = 0
    amplicon_coords_list = []
    mplex_ampl_coords_list = []
    Gfor_subsetting = nx.Graph()

    # NOTE(review): this header names 15 columns (including Primer_dimer_dG
    # and Amplicon_size) but each data row below carries 13 values. Left
    # unchanged because downstream readers may depend on the current layout.
    info_header = [
        'Primer_pair#', 'Primer_name', 'Primer_seq', 'Strand', '5prime_pos',
        '3prime_pos', 'Tm', 'Max_misprimeTm', 'GC', 'Primer_dimer_dG',
        'Amplicon_size', 'Amplicon_GC', 'Gaps', 'Polymorphisms',
        'Amplicon_seq']

    with open(outdir + "PPS_primer_pairs_info.txt", "w") as output_primer_pairs, \
            open(outdir + "_" + "primer_pairs_order.txt", "w") as order_primer_pairs, \
            open(outdir + 'bed_separate_tracks_selected_oligos.bed', 'w') as f8:
        output_primer_pairs.write('\t'.join(info_header) + '\n')
        order_primer_pairs.write(
            'Primer_name' + '\t' + 'Primer_seq' + '\t' + 'Tm' + '\n')
        f8.write('track name="Primers" description="Primers on separate tracks" visibility=2 colorByStrand="255,0,0 0,0,255"' + '\n')

        # Example node:
        #   nodeId: 'TA_1_chr17_7565168_23_65.84_+_TCCCTGGTTAAGAGATCCTCCTG'
        #   data:   {'p_chrom': 'chr17', 'p_start_pos': 7565168,
        #            'Tm': 65.84, 'GC': 52.17, 'p_stop_pos': 7565191,
        #            'max_misprimeTm': 54.02, 'strand': '+'}
        for nodeId, data in G.nodes(data=True):
            site_start = data['p_start_pos']
            if data['strand'] == "+":
                amplicon_start = int(data['p_stop_pos'])
                amplicon_end = int(site_start) + amplicon_span
                name_suffix, id_key = "_F", "id_f"
            elif data['strand'] == "-":
                amplicon_start = int(site_start) - amplicon_span
                amplicon_end = int(site_start)
                name_suffix, id_key = "_R", "id_r"
            else:
                continue

            node_fields = nodeId.split('_')
            primer_id = node_fields[1]
            primer_seq = node_fields[-1]
            chrom = data['p_chrom']
            primer_name = ("TA_" + str(primer_id) + "_" + chrom + "_" +
                           str(site_start) + "_" + str(len(primer_seq)) +
                           name_suffix)

            # The margin is re-checked on integer-truncated temperatures.
            if not (int(data['Tm']) - int(data['max_misprimeTm'])
                    >= pair_misprimeTm_diff):
                continue

            amplicon_seq = seq_extraction_loci(locus, primer_id,
                                               amplicon_start, amplicon_end)
            if amplicon_seq == '':
                continue
            has_gap = 'N' * 100 in amplicon_seq
            if amplicon_gap_filter == 1 and has_gap:
                continue

            amplicon_gc = spf.gc_content(amplicon_seq.upper())
            gaps = 'Yes' if has_gap else 'No'
            indel = 'Yes' if 'n' in amplicon_seq else 'No'

            amplicon_coords = (primer_id, chrom, amplicon_start,
                               amplicon_end)
            amplicon_coords_list.append(amplicon_coords)

            # NOTE(review): nesting reconstructed from whitespace-mangled
            # source; the multiplex bookkeeping is taken to be inside this
            # condition and the counter/output row outside it. Confirm.
            if (data['Tm'] >= multiplex_Tm_min and
                    data['Tm'] <= multiplex_Tm_max and
                    data['Continuous_GC'] <= Continus_GC_num):
                primer_info = {
                    'primer_chr': chrom,
                    'primer_name': primer_name,
                    'primer_sequence': nodeId,
                    'strand': data['strand'],
                    'p_start_pos': data['p_start_pos'],
                    'p_stop_pos': data['p_stop_pos'],
                    'Tm': data['Tm'],
                    'max_misprimeTm': data['max_misprimeTm'],
                    'GC': data['GC'],
                }
                amplicon_info = {
                    'amplicon_gc': amplicon_gc,
                    'gaps': gaps,
                    'indel': indel,
                    'amplicon_seq': amplicon_seq,
                }
                # The node attribute holding the primer id is named id_f for
                # "+" primers and id_r for "-" primers.
                Gfor_subsetting.add_node(amplicon_coords,
                                         primer_info=primer_info,
                                         amplicon_info=amplicon_info,
                                         **{id_key: primer_id})
                mplex_ampl_coords_list.append(amplicon_coords)

            primer_pair_counter += 1
            info_row = [
                primer_pair_counter, primer_name, primer_seq,
                data['strand'], data['p_start_pos'], data['p_stop_pos'],
                data['Tm'], data['max_misprimeTm'], data['GC'],
                amplicon_gc, gaps, indel, amplicon_seq]
            output_primer_pairs.write('\t'.join(map(str, info_row)) + '\n')

    return {
        'mplex_ampl_coords_list': mplex_ampl_coords_list,
        'Gfor_subsetting': Gfor_subsetting,
        'amplicon_coords_list': amplicon_coords_list,
        'primer_pair_counter': primer_pair_counter
    }