Example #1
0
def variant_mask(chr_no, start_pos, stop_pos, variant_path, refDB_path,
                 write_path, time_stamp, variant_mask_condition):
    """Write a variant-masked FASTA record for a locus and return mask stats.

    The locus sequence is obtained from ``get_Locus(start_pos, stop_pos,
    chr_no, refDB_path)``.  When ``variant_mask_condition == 1`` the
    tab-separated table ``variant_path + "chr" + str(chr_no)`` is read and
    every row whose ``coordinate`` lies in ``[start_pos, stop_pos]`` masks
    one base of the locus:

    * ``alternate_allele`` containing ``<INS>`` or ``<DEL>`` -> ``"n"``
    * any other row                                          -> ``"N"``

    The (possibly masked) sequence is written to
    ``<write_path><time_stamp>_<chr_no>_<start_pos>_<length>_VariantMasked.fasta``
    under the header ``>TA_<chr_no>_<start_pos>_<length>_VariantMasked``.

    Returns:
        list: ``[indel_count, snp_count, gap_count, locus_gc]`` where
        ``gap_count`` is ``Locus.count('N' * 100)`` and ``locus_gc`` is
        ``gc_content(Locus)``, both taken from the locus *before* masking.
    """
    indel_count = 0
    snp_count = 0
    locus_length = (stop_pos - start_pos) + 1
    label = "_".join([str(chr_no), str(start_pos), str(locus_length)])
    out_path = (write_path + str(time_stamp) + "_" + label +
                "_VariantMasked.fasta")

    # The context manager guarantees the handle is closed even when reading
    # the locus or the variant table raises part-way through.
    with open(out_path, "w") as output_poly_masked:
        output_poly_masked.write(">TA_" + label + "_VariantMasked" + '\n')

        Locus = get_Locus(start_pos, stop_pos, chr_no, refDB_path)
        gap_count = Locus.count('N' * 100)
        locus_gc = gc_content(Locus)

        if variant_mask_condition == 1:
            df = pd.read_csv(variant_path + "chr" + str(chr_no),
                             sep='\t', header=0)
            df = df[(df.coordinate >= int(start_pos))
                    & (df.coordinate <= int(stop_pos))]

            # Edit a list of characters and join once at the end instead of
            # rebuilding the whole locus string for every variant.
            bases = list(Locus)
            for coordinate, alt_allele in zip(df['coordinate'],
                                              df['alternate_allele']):
                if '<INS>' in alt_allele or '<DEL>' in alt_allele:
                    mask = "n"  # Indels annotated as "n"
                    indel_count += 1
                else:
                    mask = "N"  # Polymorphisms other than indels annotated as "N"
                    snp_count += 1
                # 0-based offset of the variant inside the locus.
                offset = int(coordinate) - start_pos
                # Slice assignment behaves like the string form
                # Locus[:offset] + mask + Locus[offset + 1:], including when
                # offset lies past the end of the retrieved sequence.
                bases[offset:offset + 1] = mask
            Locus = "".join(bases)

        output_poly_masked.write(str(Locus))

    return [indel_count, snp_count, gap_count, locus_gc]
Example #2
0
def primer_filter(primer, minTm, maxTm, GC_range_min, GC_range_max, self_Tmdiff, filter_AT_3prime, primer_dict, i, strand, filter_di_si_repeats, filter_GC_clamp):
    """Run ``primer`` through the filter cascade and record it if it survives.

    The checks run in this order and the function returns as soon as one
    fails: composition, GC content within ``[GC_range_min, GC_range_max]``,
    3' AT-end check, GC-clamp check, di/single-nucleotide repeat check,
    melting temperature within ``[minTm, maxTm]``, and finally a margin of at
    least ``self_Tmdiff`` between the primer Tm and both its hairpin Tm and
    its homodimer Tm.

    A surviving primer is stored in ``primer_dict`` (mutated in place) as
    ``[i, primer_size, 1, primer_Tm, strand]``; if the primer is already
    present only the counter at index 2 is incremented.  Reaction conditions
    (``primer_conc``, ``Na``, ``K``, ``Tris``, ``Mg``, ``dNTPs``,
    ``monovalent_cation_eq``), ``di_si_repeats_threshold`` and ``primer_size``
    are read from module scope.

    Returns:
        None
    """
    if not check_compostion(primer):
        return

    gc_value = gc_content(primer)
    if not (gc_value >= GC_range_min and gc_value <= GC_range_max):
        return

    if check_ATends(primer, filter_AT_3prime) != "noATend_filter":
        return

    if check_GC_clamp(primer, filter_GC_clamp) != "noGC_clamp_filter":
        return

    repeat_flag = di_single_nucleo_repeat_filter(
        primer,
        filter_di_si_repeats=filter_di_si_repeats,
        di_si_repeats_threshold=di_si_repeats_threshold)
    if repeat_flag != 0:
        return

    melting_temp = float(
        NN_Tm(seq=primer, compl_seq=complement(primer),
              primer_conc=primer_conc, Na=Na, K=K, Tris=Tris, Mg=Mg,
              dNTPs=dNTPs, ion_corr=True))
    if not (melting_temp >= minTm and melting_temp <= maxTm):
        return

    hairpin_temp = float(hairpin_Tm(primer, monovalent_cation_eq, primer_conc))
    homodimer_temp = float(homodimer_Tm(primer, monovalent_cation_eq, primer_conc))

    # Both self-structures must melt at least self_Tmdiff below the primer Tm.
    clears_self_structures = (
        (melting_temp - hairpin_temp) >= float(self_Tmdiff)
        and (melting_temp - homodimer_temp) >= float(self_Tmdiff))
    if not clears_self_structures:
        return

    if primer in primer_dict:
        primer_dict[primer][2] += 1
    else:
        primer_dict[primer] = [i, (primer_size), 1, melting_temp, strand]
Example #3
0
def primer_filter(primer, minTm, maxTm, GC_range_min, GC_range_max,
                  self_Tmdiff, filter_AT_3prime, primer_dict, i, strand,
                  filter_di_si_repeats, filter_GC_clamp):
    """Add ``primer`` to ``primer_dict`` when it passes every filter.

    Filters are applied sequentially and the first failure ends the call:

    1. ``check_compostion(primer)`` is truthy.
    2. ``gc_content(primer)`` lies in ``[GC_range_min, GC_range_max]``.
    3. ``check_ATends`` reports ``"noATend_filter"``.
    4. ``check_GC_clamp`` reports ``"noGC_clamp_filter"``.
    5. ``di_single_nucleo_repeat_filter`` returns ``0``.
    6. The nearest-neighbour Tm lies in ``[minTm, maxTm]``.
    7. The Tm exceeds both the hairpin Tm and the homodimer Tm by at least
       ``self_Tmdiff``.

    On success ``primer_dict[primer]`` becomes
    ``[i, primer_size, 1, primer_Tm, strand]``, or, if the key already
    exists, its element at index 2 is incremented.  Buffer/concentration
    settings, ``di_si_repeats_threshold`` and ``primer_size`` come from
    module-level names.

    Returns:
        None; ``primer_dict`` is updated in place.
    """
    if not check_compostion(primer):
        return

    gc_fraction = gc_content(primer)
    gc_in_range = gc_fraction >= GC_range_min and gc_fraction <= GC_range_max
    if not gc_in_range:
        return

    at_end_status = check_ATends(primer, filter_AT_3prime)
    if at_end_status != "noATend_filter":
        return

    gc_clamp_status = check_GC_clamp(primer, filter_GC_clamp)
    if gc_clamp_status != "noGC_clamp_filter":
        return

    repeat_status = di_single_nucleo_repeat_filter(
        primer,
        filter_di_si_repeats=filter_di_si_repeats,
        di_si_repeats_threshold=di_si_repeats_threshold)
    if repeat_status != 0:
        return

    tm_value = float(
        NN_Tm(seq=primer,
              compl_seq=complement(primer),
              primer_conc=primer_conc,
              Na=Na,
              K=K,
              Tris=Tris,
              Mg=Mg,
              dNTPs=dNTPs,
              ion_corr=True))
    tm_in_range = tm_value >= minTm and tm_value <= maxTm
    if not tm_in_range:
        return

    tm_of_hairpin = float(
        hairpin_Tm(primer, monovalent_cation_eq, primer_conc))
    tm_of_homodimer = float(
        homodimer_Tm(primer, monovalent_cation_eq, primer_conc))

    # Require the margin against the hairpin first, then the homodimer.
    if not ((tm_value - tm_of_hairpin) >= float(self_Tmdiff)
            and (tm_value - tm_of_homodimer) >= float(self_Tmdiff)):
        return

    if primer in primer_dict:
        # Seen before: only bump the occurrence counter.
        primer_dict[primer][2] += 1
        return
    primer_dict[primer] = [i, (primer_size), 1, tm_value, strand]
Example #4
0
def variant_mask(chr_no, start_pos, stop_pos, variant_path, refDB_path, write_path, time_stamp, variant_mask_condition):
    """Mask known variants in a locus, write it as FASTA, and report counts.

    ``get_Locus(start_pos, stop_pos, chr_no, refDB_path)`` supplies the
    sequence.  If ``variant_mask_condition == 1``, the tab-separated file
    ``variant_path + "chr" + str(chr_no)`` is loaded and each row with
    ``start_pos <= coordinate <= stop_pos`` replaces the base at that
    coordinate: ``"n"`` when ``alternate_allele`` contains ``<INS>`` or
    ``<DEL>``, otherwise ``"N"``.

    Output file:
        ``write_path + "<time_stamp>_<chr_no>_<start_pos>_<length>_VariantMasked.fasta"``
        containing the header ``>TA_<chr_no>_<start_pos>_<length>_VariantMasked``
        followed by the sequence.

    Returns:
        list: ``[indel_count, snp_count, gap_count, locus_gc]``; the last two
        are computed on the locus before any masking is applied
        (``gap_count`` counts occurrences of 100 consecutive ``N``).
    """
    indel_count = 0
    snp_count = 0
    filename = "chr" + str(chr_no)
    span = str((stop_pos - start_pos) + 1)
    record_id = str(chr_no) + "_" + str(start_pos) + "_" + span

    # `with` closes the FASTA handle on every exit path; previously an error
    # while fetching the locus or parsing the variant table leaked it.
    with open(write_path + str(time_stamp) + "_" + record_id + "_VariantMasked.fasta", "w") as output_poly_masked:
        output_poly_masked.write(">" + "TA_" + record_id + "_" + "VariantMasked" + '\n')

        Locus = get_Locus(start_pos, stop_pos, chr_no, refDB_path)
        gap_count = Locus.count('N' * 100)
        locus_gc = gc_content(Locus)

        if variant_mask_condition == 1:
            df = pd.read_csv(variant_path + filename, sep='\t', header=0)
            df = df[(df.coordinate >= int(start_pos)) & (df.coordinate <= int(stop_pos))]

            # Work on a mutable character list so each variant costs O(1)
            # rather than a full copy of the locus string.
            locus_chars = list(Locus)
            for coordinate, alternate_allele in zip(df['coordinate'], df['alternate_allele']):
                is_indel = '<INS>' in alternate_allele or '<DEL>' in alternate_allele
                if is_indel:
                    indel_count += 1
                else:
                    snp_count += 1
                # Convert the genomic coordinate to a 0-based locus index.
                index = int(coordinate) - start_pos
                # Indels annotated as "n", other polymorphisms as "N".
                # Slice assignment mirrors Locus[:index] + mask + Locus[index + 1:]
                # exactly, even for an index beyond the sequence end.
                locus_chars[index:index + 1] = "n" if is_indel else "N"
            Locus = "".join(locus_chars)

        output_poly_masked.write(str(Locus))

    return [indel_count, snp_count, gap_count, locus_gc]
Example #5
0
    ### Build a nucleotide BLAST database named "<query_path>loci_db" from `locus`;
    ### makeblastdb's stdout/stderr are discarded by sending them to os.devnull.
    ### NOTE(review): `sp` is presumably the subprocess module -- confirm against the file's imports.
    f0 = open(os.devnull, 'w')
    sp.call(["makeblastdb","-in","%s" %locus, "-dbtype","nucl","-parse_seqids","-out","%sloci_db" %query_path], stdout=f0,stderr=f0)

    ### Exact match to loci to get coords
    ### blastn is asked for 100% identity hits and comma-separated output with the
    ### fields: qseqid, length, sstart, send (outfmt 10). stdout is captured, stderr discarded.
    p5 = sp.Popen(["blastn","-db","%sloci_db" %query_path,"-query","%s" %fasta_input_file_FR,"-evalue","0.1","-word_size","%s" %word_size,"-gapopen","0","-gapextend","2","-reward","1","-penalty","-3","-dust","no","-perc_identity","100","-max_target_seqs", "13","-outfmt","10 qseqid length sstart send", "-num_threads","%s" %mp_num_threads],stdout=sp.PIPE,stderr=f0)
    ### NOTE(review): the result is split on '\n' below as a str; on Python 3 communicate()
    ### returns bytes unless the pipe is opened in text mode -- confirm the target interpreter.
    exact_match_output_pooled_primers, error = p5.communicate()

    ### final pooled oligo coordinate output file
    f5 = open(query_path + path + Time_stamp + "_" + 'PSE_out3_1.csv','w')
    f5.write("Primer"+','+"Loci_start"+','+"Loci_stop"+','+"Genome_start"+','+"Genome_stop"+','+"Strand"+','+"Primer_Tm"+','+"Max_misprime_Tm"+','+"Tm_difference"+','+'Misprime_Tm_'+str(misprime_Tm_percentile_value)+'th_percentile'+','+"Primer_GC"+','+"Continuous_GC"+','+"3'_region_mismatches"+','+"Hairpin_Tm"+','+"Homodimer_Tm"+'\n')                            ### changed header name to Max_misprime_Tm

    ### One BLAST hit per line; [:-1] drops the final element of the split
    ### (presumably the empty string after the trailing newline).
    for exact_match_pooled_output_line in exact_match_output_pooled_primers.split('\n')[:-1]:
        exact_match_pooled_output_line = exact_match_pooled_output_line.strip(' ').split(',')
        ### Field 0 (qseqid) is used directly as the primer sequence: it is passed to
        ### gc_content/continuous_gc, measured with len(), and used as the dict key below.
        Primer_pooled        =    exact_match_pooled_output_line[0]
        primer_gc            =    gc_content(Primer_pooled)
        primer_continuous_gc=    continuous_gc(Primer_pooled)
        query_len_pooled    =    len(Primer_pooled)
        match_len_pooled    =    int(exact_match_pooled_output_line[1])
        ### sstart/send are offsets within the locus; shift them by
        ### actual_locus_start_pos - 1 to express them relative to that start position.
        sstart                 =    int(exact_match_pooled_output_line[2])
        sstart                =    (actual_locus_start_pos + sstart) - 1
        send                =    int(exact_match_pooled_output_line[3])
        send                =    (actual_locus_start_pos + send) - 1
        ### Keep only hits whose alignment length equals the full primer length.
        if query_len_pooled == match_len_pooled:
            ### Unpack the per-primer values stored at indices 0-6 of pooled_primer_f_r_dict.
            Primer_Tm_p            =    pooled_primer_f_r_dict[Primer_pooled][0]
            Max_misprime_Tm_p    =    pooled_primer_f_r_dict[Primer_pooled][1]
            Tm_difference_p        =    pooled_primer_f_r_dict[Primer_pooled][2]
            three_prime_region_mismatches        =    pooled_primer_f_r_dict[Primer_pooled][3]
            misprime_Tm_percentile = pooled_primer_f_r_dict[Primer_pooled][4]
            Hairpin_Tm            = pooled_primer_f_r_dict[Primer_pooled][5]
            Homodimer_Tm        = pooled_primer_f_r_dict[Primer_pooled][6]
Example #6
0
def _primer_record(name, sequence, attrs):
    """Bundle a primer's name, sequence and graph-node attributes in one dict."""
    record = {'primer_name': name, 'primer_sequence': sequence}
    for key in ('strand', 'p_start_pos', 'p_stop_pos', 'Tm',
                'max_misprimeTm', 'GC'):
        record[key] = attrs[key]
    return record


def _tsv_line(fields):
    """Return ``fields`` stringified, tab-joined and newline-terminated."""
    return '\t'.join(str(field) for field in fields) + '\n'


def pick_primer_pairs(input_file):
    """Pair forward and reverse primers from a primer CSV and write reports.

    ``input_file`` is a comma-separated file with at least the columns
    ``Primer``, ``Genome_start``, ``Genome_stop``, ``Strand``, ``Primer_Tm``,
    ``Max_misprime_Tm`` (a number or the literal ``3prime_mismatch``, which is
    treated as 0) and ``Primer_GC``.  Primers whose
    ``Primer_Tm - Max_misprime_Tm`` is at least ``pair_misprimeTm_diff``
    become candidates.

    Every "+" candidate is tried against every "-" candidate.  A pair is
    accepted when, in order: the amplicon length is within
    ``[amplicon_size_min, amplicon_size_max]``; the pair-level mispriming
    margin is at least ``pair_misprimeTm_diff``; the Tm difference is at most
    ``pair_Tm_diff``; ``heterodimer_dg`` is at least ``min_misprime_dg``; and,
    when ``amplicon_gap_filter == 1``, the amplicon contains no run of 100
    ``N``.

    Side effects: writes ``<path><Time_stamp>_bed_separate_tracks_selected_oligos.bed``
    (one line per distinct picked primer), appends two rows per pair to the
    module-level ``output_primer_pairs`` handle and two rows to
    ``order_primer_pairs``, and closes both of those handles before returning.
    All thresholds, ``chr_no``, ``start_pos``, ``stop_pos``, ``locus``,
    ``path`` and ``Time_stamp`` are read from module scope.

    Returns:
        dict with keys ``mplex_ampl_coords_list``, ``Gfor_subsetting`` (graph
        whose nodes are multiplex-eligible amplicon coordinates),
        ``amplicon_len_list``, ``amplicon_coords_list``,
        ``primer_pair_coords`` and ``no_unique_primers_picked``.
    """
    G = nx.Graph()
    # Close the CSV handle deterministically once all rows are consumed.
    with open(input_file) as f:
        for row in csv.DictReader(f, delimiter=','):
            primer = row['Primer']
            p_start_pos = int(row['Genome_start'])
            p_stop_pos = int(row['Genome_stop'])
            strand = row['Strand']
            Tm = float(row['Primer_Tm'])
            max_misprimeTm = row['Max_misprime_Tm']
            GC = float(row['Primer_GC'])
            if max_misprimeTm == "3prime_mismatch":
                max_misprimeTm = 0
            else:
                max_misprimeTm = float(max_misprimeTm)

            if Tm - max_misprimeTm >= pair_misprimeTm_diff:
                G.add_node(primer, p_start_pos=p_start_pos,
                           p_stop_pos=p_stop_pos, strand=strand, Tm=Tm,
                           max_misprimeTm=max_misprimeTm, GC=GC)

    # Split the candidates by strand once instead of re-scanning every node
    # for each forward primer.  Tm values are truncated to int for the
    # pairwise comparisons.
    # NOTE(review): the truncation looks unintended -- confirm.
    nodes = list(G.nodes(data=True))
    forward_primers = [(seq, attrs) for seq, attrs in nodes
                       if attrs['strand'] == "+"]
    reverse_primers = [(seq, attrs, int(attrs['Tm']),
                        int(attrs['max_misprimeTm']))
                       for seq, attrs in nodes if attrs['strand'] == "-"]

    picked_primers = set()
    primer_pair_coords = []
    amplicon_len_list = []
    amplicon_coords_list = []
    mplex_ampl_coords_list = []
    Gfor_subsetting = nx.Graph()
    primer_pair_counter = 0
    chrom = str(chr_no)

    ### pick primer pairs that meet the user defined conditions
    ### write bed files (for picked oligos)
    bed_path = path + Time_stamp + "_" + 'bed_separate_tracks_selected_oligos.bed'
    with open(bed_path, 'w') as f8:
        f8.write('browser position chr ' + chrom + ':' + str(start_pos) +
                 '-' + str(stop_pos) + '\n')
        f8.write('track name="Primers" description="Primers on separate tracks" visibility=2 colorByStrand="255,0,0 0,0,255"' + '\n')

        for f_seq, data_f in forward_primers:
            forward_start = data_f['p_start_pos']
            forward_Tm = int(data_f['Tm'])
            forward_MaxMispTm = int(data_f['max_misprimeTm'])

            for r_seq, data_r, reverse_Tm, reverse_MaxMispTm in reverse_primers:
                reverse_stop = data_r['p_stop_pos']
                amplicon_len = (reverse_stop - forward_start) + 1

                ### user defined amplicon length condition
                if not (amplicon_size_min <= amplicon_len <= amplicon_size_max):
                    continue
                ### user defined pair_misprimeTm_diff condition
                if (min(forward_Tm, reverse_Tm) -
                        max(forward_MaxMispTm, reverse_MaxMispTm)
                        < pair_misprimeTm_diff):
                    continue
                ### user defined pair_Tm_diff condition
                if abs(forward_Tm - reverse_Tm) > pair_Tm_diff:
                    continue
                ### user defined between-primer interaction_dg condition
                interaction_dg = heterodimer_dg(
                    f_seq, r_seq, mv_cation=monovalent_cation_eq,
                    primer_conc=primer_conc)
                if not (interaction_dg >= min_misprime_dg):
                    continue

                ### get the expected amplicon sequence
                amplicon_seq = spf.seq_extraction_loci(
                    locus, start_pos, forward_start, reverse_stop)
                has_gap = 'N' * 100 in amplicon_seq

                ### eliminate amplicons with gaps
                # Only this pair is rejected: node order is not guaranteed to
                # be positional, so other reverse primers may still give a
                # gap-free amplicon with the same forward primer.
                if amplicon_gap_filter == 1 and has_gap:
                    continue

                # Recorded only for accepted pairs so this list stays aligned
                # with amplicon_coords_list and primer_pair_coords.
                amplicon_len_list.append(amplicon_len)
                amplicon_gc = spf.gc_content(amplicon_seq.upper())
                gaps = 'Yes' if has_gap else 'No'
                # Lower-case "n" marks masked indels in the locus sequence.
                indel = 'Yes' if 'n' in amplicon_seq else 'No'

                primer_name_f = "_".join(
                    ["TA", chrom, str(forward_start), str(len(f_seq)), "F"])
                ### naming of r primers uses 5' end coords
                primer_name_r = "_".join(
                    ["TA", chrom, str(reverse_stop), str(len(r_seq)), "R"])

                amplicon_coords = (forward_start, reverse_stop)
                primer_pair_coords.append(amplicon_coords)
                amplicon_coords_list.append(amplicon_coords)

                # Each primer is written to the BED track the first time it is
                # picked.  The two checks are independent so that a new
                # reverse primer is not skipped when its forward partner is
                # also new.
                if f_seq not in picked_primers:
                    f8.write(_tsv_line(['chr' + chrom, data_f['p_start_pos'],
                                        data_f['p_stop_pos'], primer_name_f,
                                        0, "+"]))
                    picked_primers.add(f_seq)
                if r_seq not in picked_primers:
                    f8.write(_tsv_line(['chr' + chrom, data_r['p_start_pos'],
                                        data_r['p_stop_pos'], primer_name_r,
                                        0, "-"]))
                    picked_primers.add(r_seq)

                ### info for multiplex primer picking
                ### select primers within the given Tm range if the multiplex option is enabled
                if (multiplex_primers == 1
                        and multiplex_Tm_min <= data_f['Tm'] <= multiplex_Tm_max
                        and multiplex_Tm_min <= data_r['Tm'] <= multiplex_Tm_max):
                    amplicon_info = {'interaction_dg': interaction_dg,
                                     'amplicon_len': amplicon_len,
                                     'amplicon_gc': amplicon_gc,
                                     'gaps': gaps,
                                     'indel': indel,
                                     'amplicon_seq': amplicon_seq}
                    Gfor_subsetting.add_node(
                        amplicon_coords,
                        f_primer_info=_primer_record(primer_name_f, f_seq, data_f),
                        r_primer_info=_primer_record(primer_name_r, r_seq, data_r),
                        amplicon_info=amplicon_info)
                    mplex_ampl_coords_list.append(amplicon_coords)

                primer_pair_counter += 1
                output_primer_pairs.write(_tsv_line([
                    primer_pair_counter, primer_name_f, f_seq,
                    data_f['strand'], data_f['p_start_pos'],
                    data_f['p_stop_pos'], data_f['Tm'],
                    data_f['max_misprimeTm'], data_f['GC'], interaction_dg,
                    amplicon_len, amplicon_gc, gaps, indel, amplicon_seq]))
                # Reverse row lists p_stop_pos before p_start_pos (the
                # reverse primer's 5' end comes first); amplicon columns are
                # filled with '-'.
                output_primer_pairs.write(_tsv_line([
                    primer_pair_counter, primer_name_r, r_seq,
                    data_r['strand'], data_r['p_stop_pos'],
                    data_r['p_start_pos'], data_r['Tm'],
                    data_r['max_misprimeTm'], data_r['GC'],
                    '-', '-', '-', '-', '-', '-']))
                order_primer_pairs.write(
                    _tsv_line([primer_name_f, f_seq, data_f['Tm']]))
                order_primer_pairs.write(
                    _tsv_line([primer_name_r, r_seq, data_r['Tm']]))

    output_primer_pairs.close()
    order_primer_pairs.close()
    no_unique_primers_picked = len(picked_primers)
    return {'mplex_ampl_coords_list': mplex_ampl_coords_list,
            'Gfor_subsetting': Gfor_subsetting,
            'amplicon_len_list': amplicon_len_list,
            'amplicon_coords_list': amplicon_coords_list,
            'primer_pair_coords': primer_pair_coords,
            'no_unique_primers_picked': no_unique_primers_picked}
Example #7
0
def _primer_record(name, sequence, attrs):
    """Bundle a primer's name, sequence and graph-node attributes in one dict."""
    record = {'primer_name': name, 'primer_sequence': sequence}
    for key in ('strand', 'p_start_pos', 'p_stop_pos', 'Tm',
                'max_misprimeTm', 'GC'):
        record[key] = attrs[key]
    return record


def _tsv_line(fields):
    """Return ``fields`` stringified, tab-joined and newline-terminated."""
    return '\t'.join(str(field) for field in fields) + '\n'


def pick_primer_pairs(input_file):
    """Pair forward and reverse primers from a primer CSV and write reports.

    ``input_file`` is a comma-separated file with at least the columns
    ``Primer``, ``Genome_start``, ``Genome_stop``, ``Strand``, ``Primer_Tm``,
    ``Max_misprime_Tm`` (a number or the literal ``3prime_mismatch``, which is
    treated as 0) and ``Primer_GC``.  Primers whose
    ``Primer_Tm - Max_misprime_Tm`` is at least ``pair_misprimeTm_diff``
    become candidates.

    Every "+" candidate is tried against every "-" candidate.  A pair is
    accepted when, in order: the amplicon length is within
    ``[amplicon_size_min, amplicon_size_max]``; the pair-level mispriming
    margin is at least ``pair_misprimeTm_diff``; the Tm difference is at most
    ``pair_Tm_diff``; ``heterodimer_dg`` is at least ``min_misprime_dg``; and,
    when ``amplicon_gap_filter == 1``, the amplicon contains no run of 100
    ``N``.

    Side effects: writes ``<path><Time_stamp>_bed_separate_tracks_selected_oligos.bed``
    (one line per distinct picked primer), appends two rows per pair to the
    module-level ``output_primer_pairs`` handle and two rows to
    ``order_primer_pairs``, and closes both of those handles before returning.
    All thresholds, ``chr_no``, ``start_pos``, ``stop_pos``, ``locus``,
    ``path`` and ``Time_stamp`` are read from module scope.

    Returns:
        dict with keys ``mplex_ampl_coords_list``, ``Gfor_subsetting`` (graph
        whose nodes are multiplex-eligible amplicon coordinates),
        ``amplicon_len_list``, ``amplicon_coords_list``,
        ``primer_pair_coords`` and ``no_unique_primers_picked``.
    """
    G = nx.Graph()
    # Close the CSV handle deterministically once all rows are consumed.
    with open(input_file) as f:
        for row in csv.DictReader(f, delimiter=','):
            primer = row['Primer']
            p_start_pos = int(row['Genome_start'])
            p_stop_pos = int(row['Genome_stop'])
            strand = row['Strand']
            Tm = float(row['Primer_Tm'])
            max_misprimeTm = row['Max_misprime_Tm']
            GC = float(row['Primer_GC'])
            if max_misprimeTm == "3prime_mismatch":
                max_misprimeTm = 0
            else:
                max_misprimeTm = float(max_misprimeTm)

            if Tm - max_misprimeTm >= pair_misprimeTm_diff:
                G.add_node(primer,
                           p_start_pos=p_start_pos,
                           p_stop_pos=p_stop_pos,
                           strand=strand,
                           Tm=Tm,
                           max_misprimeTm=max_misprimeTm,
                           GC=GC)

    # Split the candidates by strand once instead of re-scanning every node
    # for each forward primer.  Tm values are truncated to int for the
    # pairwise comparisons.
    # NOTE(review): the truncation looks unintended -- confirm.
    nodes = list(G.nodes(data=True))
    forward_primers = [(seq, attrs) for seq, attrs in nodes
                       if attrs['strand'] == "+"]
    reverse_primers = [(seq, attrs, int(attrs['Tm']),
                        int(attrs['max_misprimeTm']))
                       for seq, attrs in nodes if attrs['strand'] == "-"]

    picked_primers = set()
    primer_pair_coords = []
    amplicon_len_list = []
    amplicon_coords_list = []
    mplex_ampl_coords_list = []
    Gfor_subsetting = nx.Graph()
    primer_pair_counter = 0
    chrom = str(chr_no)

    ### pick primer pairs that meet the user defined conditions
    ### write bed files (for picked oligos)
    bed_path = (path + Time_stamp + "_" +
                'bed_separate_tracks_selected_oligos.bed')
    with open(bed_path, 'w') as f8:
        f8.write('browser position chr ' + chrom + ':' + str(start_pos) +
                 '-' + str(stop_pos) + '\n')
        f8.write(
            'track name="Primers" description="Primers on separate tracks" visibility=2 colorByStrand="255,0,0 0,0,255"'
            + '\n')

        for f_seq, data_f in forward_primers:
            forward_start = data_f['p_start_pos']
            forward_Tm = int(data_f['Tm'])
            forward_MaxMispTm = int(data_f['max_misprimeTm'])

            for r_seq, data_r, reverse_Tm, reverse_MaxMispTm in reverse_primers:
                reverse_stop = data_r['p_stop_pos']
                amplicon_len = (reverse_stop - forward_start) + 1

                ### user defined amplicon length condition
                if not (amplicon_size_min <= amplicon_len <= amplicon_size_max):
                    continue
                ### user defined pair_misprimeTm_diff condition
                if (min(forward_Tm, reverse_Tm) -
                        max(forward_MaxMispTm, reverse_MaxMispTm)
                        < pair_misprimeTm_diff):
                    continue
                ### user defined pair_Tm_diff condition
                if abs(forward_Tm - reverse_Tm) > pair_Tm_diff:
                    continue
                ### user defined between-primer interaction_dg condition
                interaction_dg = heterodimer_dg(
                    f_seq,
                    r_seq,
                    mv_cation=monovalent_cation_eq,
                    primer_conc=primer_conc)
                if not (interaction_dg >= min_misprime_dg):
                    continue

                ### get the expected amplicon sequence
                amplicon_seq = spf.seq_extraction_loci(
                    locus, start_pos, forward_start, reverse_stop)
                has_gap = 'N' * 100 in amplicon_seq

                ### eliminate amplicons with gaps
                # Only this pair is rejected: node order is not guaranteed to
                # be positional, so other reverse primers may still give a
                # gap-free amplicon with the same forward primer.
                if amplicon_gap_filter == 1 and has_gap:
                    continue

                # Recorded only for accepted pairs so this list stays aligned
                # with amplicon_coords_list and primer_pair_coords.
                amplicon_len_list.append(amplicon_len)
                amplicon_gc = spf.gc_content(amplicon_seq.upper())
                gaps = 'Yes' if has_gap else 'No'
                # Lower-case "n" marks masked indels in the locus sequence.
                indel = 'Yes' if 'n' in amplicon_seq else 'No'

                primer_name_f = "_".join(
                    ["TA", chrom, str(forward_start), str(len(f_seq)), "F"])
                ### naming of r primers uses 5' end coords
                primer_name_r = "_".join(
                    ["TA", chrom, str(reverse_stop), str(len(r_seq)), "R"])

                amplicon_coords = (forward_start, reverse_stop)
                primer_pair_coords.append(amplicon_coords)
                amplicon_coords_list.append(amplicon_coords)

                # Each primer is written to the BED track the first time it is
                # picked.  The two checks are independent so that a new
                # reverse primer is not skipped when its forward partner is
                # also new.
                if f_seq not in picked_primers:
                    f8.write(
                        _tsv_line([
                            'chr' + chrom, data_f['p_start_pos'],
                            data_f['p_stop_pos'], primer_name_f, 0, "+"
                        ]))
                    picked_primers.add(f_seq)
                if r_seq not in picked_primers:
                    f8.write(
                        _tsv_line([
                            'chr' + chrom, data_r['p_start_pos'],
                            data_r['p_stop_pos'], primer_name_r, 0, "-"
                        ]))
                    picked_primers.add(r_seq)

                ### info for multiplex primer picking
                ### select primers within the given Tm range if the multiplex option is enabled
                if (multiplex_primers == 1
                        and multiplex_Tm_min <= data_f['Tm'] <= multiplex_Tm_max
                        and multiplex_Tm_min <= data_r['Tm'] <= multiplex_Tm_max):
                    amplicon_info = {
                        'interaction_dg': interaction_dg,
                        'amplicon_len': amplicon_len,
                        'amplicon_gc': amplicon_gc,
                        'gaps': gaps,
                        'indel': indel,
                        'amplicon_seq': amplicon_seq
                    }
                    Gfor_subsetting.add_node(
                        amplicon_coords,
                        f_primer_info=_primer_record(primer_name_f, f_seq,
                                                     data_f),
                        r_primer_info=_primer_record(primer_name_r, r_seq,
                                                     data_r),
                        amplicon_info=amplicon_info)
                    mplex_ampl_coords_list.append(amplicon_coords)

                primer_pair_counter += 1
                output_primer_pairs.write(
                    _tsv_line([
                        primer_pair_counter, primer_name_f, f_seq,
                        data_f['strand'], data_f['p_start_pos'],
                        data_f['p_stop_pos'], data_f['Tm'],
                        data_f['max_misprimeTm'], data_f['GC'],
                        interaction_dg, amplicon_len, amplicon_gc, gaps,
                        indel, amplicon_seq
                    ]))
                # Reverse row lists p_stop_pos before p_start_pos (the
                # reverse primer's 5' end comes first); amplicon columns are
                # filled with '-'.
                output_primer_pairs.write(
                    _tsv_line([
                        primer_pair_counter, primer_name_r, r_seq,
                        data_r['strand'], data_r['p_stop_pos'],
                        data_r['p_start_pos'], data_r['Tm'],
                        data_r['max_misprimeTm'], data_r['GC'], '-', '-',
                        '-', '-', '-', '-'
                    ]))
                order_primer_pairs.write(
                    _tsv_line([primer_name_f, f_seq, data_f['Tm']]))
                order_primer_pairs.write(
                    _tsv_line([primer_name_r, r_seq, data_r['Tm']]))

    output_primer_pairs.close()
    order_primer_pairs.close()
    no_unique_primers_picked = len(picked_primers)
    return {
        'mplex_ampl_coords_list': mplex_ampl_coords_list,
        'Gfor_subsetting': Gfor_subsetting,
        'amplicon_len_list': amplicon_len_list,
        'amplicon_coords_list': amplicon_coords_list,
        'primer_pair_coords': primer_pair_coords,
        'no_unique_primers_picked': no_unique_primers_picked
    }
Example #8
0
def PickPrimerPair(inputfile):
    """Pick candidate primers from a CSV and record one amplicon window each.

    Each CSV row must provide the columns ``id``, ``Strand``, ``Primer_Tm``,
    ``Max_misprime_Tm``, ``Primer_GC`` and ``Continuous_GC``.  The ``id`` is
    split on ``_``; field 2 is used as the chromosome, field 3 as the start
    position and field 4 as the length (stop = start + length), and the last
    field as the primer sequence, e.g.
    ``TA_1_chr17_7565168_23_65.84_+_TCCCTGGTTAAGAGATCCTCCTG``.

    A primer is kept when ``Tm - max_misprimeTm >= pair_misprimeTm_diff``
    (a ``Max_misprime_Tm`` of ``"3prime_mismatch"`` counts as 0).  For every
    kept primer an amplicon window is extracted with ``seq_extraction_loci``:

    * ``+`` strand: from ``p_stop_pos`` to ``p_start_pos + 120``
    * ``-`` strand: from ``p_start_pos - 120`` to ``p_start_pos``

    Primers whose Tm lies within ``[multiplex_Tm_min, multiplex_Tm_max]`` and
    whose ``Continuous_GC`` is at most ``Continus_GC_num`` are added to the
    returned graph and written to ``PPS_primer_pairs_info.txt``.

    Reads the module-level names ``outdir``, ``pair_misprimeTm_diff``,
    ``amplicon_gap_filter``, ``multiplex_Tm_min``, ``multiplex_Tm_max`` and
    ``Continus_GC_num``.  Writes three files under ``outdir``:
    ``PPS_primer_pairs_info.txt``, ``_primer_pairs_order.txt`` (header only)
    and ``bed_separate_tracks_selected_oligos.bed`` (track line only).

    Args:
        inputfile: Path of the comma-separated primer file.

    Returns:
        A dict with the keys ``mplex_ampl_coords_list``, ``Gfor_subsetting``,
        ``amplicon_coords_list`` and ``primer_pair_counter``.
    """
    G = nx.Graph()
    Gfor_subsetting = nx.Graph()
    locus = outdir + "sequence.txt"

    amplicon_coords_list = []
    mplex_ampl_coords_list = []
    primer_pair_counter = 0

    # NOTE(review): this header names 15 columns, but each data row written
    # below carries 13 values (no Primer_dimer_dG / Amplicon_size).  The row
    # layout is left unchanged because downstream readers may depend on it.
    info_header = [
        'Primer_pair#', 'Primer_name', 'Primer_seq', 'Strand', '5prime_pos',
        '3prime_pos', 'Tm', 'Max_misprimeTm', 'GC', 'Primer_dimer_dG',
        'Amplicon_size', 'Amplicon_GC', 'Gaps', 'Polymorphisms',
        'Amplicon_seq',
    ]

    # All handles are managed by one ``with`` so they are released even when
    # parsing or sequence extraction raises.
    with open(inputfile) as f, \
            open(outdir + "PPS_primer_pairs_info.txt", "w") as output_primer_pairs, \
            open(outdir + "_" + "primer_pairs_order.txt", "w") as order_primer_pairs, \
            open(outdir + 'bed_separate_tracks_selected_oligos.bed', 'w') as f8:
        output_primer_pairs.write('\t'.join(info_header) + '\n')
        order_primer_pairs.write('Primer_name' + '\t' + 'Primer_seq' + '\t' +
                                 'Tm' + '\n')
        f8.write(
            'track name="Primers" description="Primers on separate tracks" visibility=2 colorByStrand="255,0,0 0,0,255"'
            + '\n')

        # Load every primer that clears the mispriming margin as a graph node.
        for row in csv.DictReader(f, delimiter=','):
            id_p = row['id']
            row_strand = row['Strand']
            Tm = float(row['Primer_Tm'])
            max_misprimeTm = row['Max_misprime_Tm']
            GC = float(row['Primer_GC'])
            Continuous_GC = int(row['Continuous_GC'])

            if max_misprimeTm == "3prime_mismatch":
                max_misprimeTm = 0
            else:
                max_misprimeTm = float(max_misprimeTm)

            if Tm - max_misprimeTm >= pair_misprimeTm_diff:
                id_fields = id_p.split('_')
                p_start = int(id_fields[3])
                G.add_node(id_p,
                           p_chrom=id_fields[2],
                           p_start_pos=p_start,
                           p_stop_pos=p_start + int(id_fields[4]),
                           strand=row_strand,
                           Tm=Tm,
                           max_misprimeTm=max_misprimeTm,
                           GC=GC,
                           Continuous_GC=Continuous_GC)

        for (nodeId, data) in G.nodes(data=True):
            strand = data['strand']
            if strand == "+":
                amplicon_start = int(data['p_stop_pos'])
                amplicon_end = int(data['p_start_pos']) + 120
                name_suffix, id_key = "_F", "id_f"
            elif strand == "-":
                amplicon_start = int(data['p_start_pos']) - 120
                amplicon_end = int(data['p_start_pos'])
                name_suffix, id_key = "_R", "id_r"
            else:
                # Only "+" and "-" primers produce an amplicon window.
                continue

            chrom = data['p_chrom']
            id_fields = nodeId.split('_')
            primer_id = id_fields[1]
            primer_seq = id_fields[-1]
            primer_name = ("TA_" + str(primer_id) + "_" + chrom + "_" +
                           str(data['p_start_pos']) + "_" +
                           str(len(primer_seq)) + name_suffix)

            # NOTE(review): this re-check truncates both temperatures to int,
            # so it can reject primers that passed the float comparison when
            # the node was loaded.  Kept as-is to preserve the selection.
            if not (int(data['Tm']) - int(data['max_misprimeTm']) >=
                    pair_misprimeTm_diff):
                continue

            amplicon_seq = seq_extraction_loci(locus, primer_id,
                                               amplicon_start, amplicon_end)
            if amplicon_seq == '':
                continue

            has_gap = 'N' * 100 in amplicon_seq
            if amplicon_gap_filter == 1 and has_gap:
                continue

            # Computed regardless of the gap filter: previously these were
            # only assigned when the filter was on, which left them undefined
            # (UnboundLocalError) for runs with the filter disabled.
            amplicon_gc = spf.gc_content(amplicon_seq.upper())
            gaps = 'Yes' if has_gap else 'No'
            indel = 'Yes' if 'n' in amplicon_seq else 'No'

            amplicon_coords = (primer_id, chrom, amplicon_start, amplicon_end)
            amplicon_coords_list.append(amplicon_coords)

            in_multiplex_range = (data['Tm'] >= multiplex_Tm_min
                                  and data['Tm'] <= multiplex_Tm_max
                                  and data['Continuous_GC'] <= Continus_GC_num)
            if not in_multiplex_range:
                continue

            node_attrs = {
                # Attribute name is "id_f" for forward and "id_r" for reverse.
                id_key: primer_id,
                'primer_info': {
                    'primer_chr': chrom,
                    'primer_name': primer_name,
                    'primer_sequence': nodeId,
                    'strand': strand,
                    'p_start_pos': data['p_start_pos'],
                    'p_stop_pos': data['p_stop_pos'],
                    'Tm': data['Tm'],
                    'max_misprimeTm': data['max_misprimeTm'],
                    'GC': data['GC'],
                },
                'amplicon_info': {
                    'amplicon_gc': amplicon_gc,
                    'gaps': gaps,
                    'indel': indel,
                    'amplicon_seq': amplicon_seq,
                },
            }
            Gfor_subsetting.add_node(amplicon_coords, **node_attrs)
            mplex_ampl_coords_list.append(amplicon_coords)

            primer_pair_counter += 1
            row_values = [
                primer_pair_counter, primer_name, primer_seq, strand,
                data['p_start_pos'], data['p_stop_pos'], data['Tm'],
                data['max_misprimeTm'], data['GC'], amplicon_gc, gaps, indel,
                amplicon_seq,
            ]
            output_primer_pairs.write(
                '\t'.join(str(value) for value in row_values) + '\n')

    return {
        'mplex_ampl_coords_list': mplex_ampl_coords_list,
        'Gfor_subsetting': Gfor_subsetting,
        'amplicon_coords_list': amplicon_coords_list,
        'primer_pair_counter': primer_pair_counter
    }