Ejemplo n.º 1
0
def GC_analyzer(vis_flag, Seq, supth, infth):
    """Print min, max, mean and standard deviation of windowed GC content.

    A fixed window size of 20 is used.

    vis_flag: when truthy, Seq_Analyzer.GC_window_analyzer_visual is also
              called with the same window and thresholds.
    Seq:      sequence handed to Seq_Analyzer.
    supth, infth: thresholds forwarded unchanged to Seq_Analyzer
              (presumably upper / lower GC limits -- confirm there).
    """
    window = 20
    if vis_flag:
        Seq_Analyzer(Seq).GC_window_analyzer_visual(window, supth, infth)
    # The analyzer returns a pair; only the first element is summarised.
    gc_values, _unused = Seq_Analyzer(Seq).GC_window_analyzer(
        window, supth, infth)
    summary = (min(gc_values), max(gc_values), numpy.mean(gc_values),
               numpy.std(gc_values))
    print("%s %s %s %s" % summary)
Ejemplo n.º 2
0
def search_tilling_primer(seg, ref, Tm, minLen, maxLen, disp, name, seq, n,
                          length):
    """Pick the best forward/reverse primer pair for one tiling segment.

    Candidates come from Seq_Analyzer(seg).Find_Primer_at_Ends_Tm with
    L_res = R_res = 2 * disp.  Each candidate is indexed as
    [0] sequence, [1] Tm, [2] position, [3] GC ratio (as used below;
    the exact layout is defined by Seq_Analyzer -- confirm there).

    Candidates flagged by off_target_binding against ``ref`` are dropped.
    The forward primer minimises 2*((pos - disp)/disp)^2 + (GC - 0.5)^2.
    The reverse primer must lie at least 100 positions after the chosen
    forward primer and minimises
    ((product - length)/length)^2 + (GC - 0.5)^2,
    where product = reverse position - forward position.
    A candidate is only accepted if its score is below 10.

    name, n: used to build the primer names "<name>_<n>_F" / "_R".
    seq:     sequence in which the reported 1-based positions are looked up
             (0 is reported when the primer is not found in it).

    Returns ([], []) when either primer cannot be found, otherwise two
    lists [name, sequence, Tm (str), position (int), GC (str)].
    """
    f_candidates, r_candidates = Seq_Analyzer(seg).Find_Primer_at_Ends_Tm(
        L_res=2 * disp, R_res=2 * disp, Tm=Tm, minLen=minLen, maxLen=maxLen)

    # --- forward primer -------------------------------------------------
    score = 10.
    best_f = -1
    for i, cand in enumerate(f_candidates):
        if off_target_binding(seq=cand[0], ref=ref, rc=False):
            continue
        # float() avoids Python 2 integer division truncating the score.
        pos_sc = 2 * ((cand[2] - disp) / float(disp))**2
        GC_sc = (cand[3] - .5)**2
        _sc = pos_sc + GC_sc
        if _sc < score:
            score = _sc
            best_f = i
    if best_f == -1:
        print("%s_%d: Foward Primer Not FOUND at Tm=%d!\n" % (name, n, Tm))
        return [], []

    # Score reverse candidates against the *selected* forward primer, not
    # whichever candidate the loop above happened to visit last.
    fp = f_candidates[best_f]

    # --- reverse primer -------------------------------------------------
    score = 10.
    best_r = -1
    for i, cand in enumerate(r_candidates):
        if off_target_binding(seq=cand[0], ref=ref, rc=True):
            continue
        product = cand[2] - fp[2]
        if product < 100:
            # minimal product > 100 bp
            continue
        # Penalise deviation of the product size from the requested length.
        pos_sc = ((product - length) / float(length))**2
        GC_sc = (cand[3] - .5)**2
        _sc = pos_sc + GC_sc
        if _sc < score:
            score = _sc
            best_r = i
    if best_r == -1:
        print("%s_%d: Reverse Primer Not FOUND at Tm=%d!\n" % (name, n, Tm))
        return [], []
    rp = r_candidates[best_r]

    # --- assemble the report records -------------------------------------
    f_primer_seq = fp[0]
    r_primer_seq = rp[0]
    f_primer_name = "%s_%d_F" % (name, n)
    print(f_primer_name)
    r_primer_name = "%s_%d_R" % (name, n)
    # str.find returns -1 when absent, so a missing primer is reported as 0.
    f_primer_pos = seq.find(f_primer_seq) + 1
    # The reverse primer is located through its reverse complement.
    r_primer_pos = seq.find(Seq_Analyzer(r_primer_seq).rcSeq()) + 1
    f_primer = [
        f_primer_name, f_primer_seq, str(fp[1]), f_primer_pos, str(fp[3])
    ]
    r_primer = [
        r_primer_name, r_primer_seq, str(rp[1]), r_primer_pos, str(rp[3])
    ]
    return f_primer, r_primer
Ejemplo n.º 3
0
def analyze_re_site(seq, re_ls, re_ls_II):
    """Export list of RE restriction sites on a circular sequence.

    REQUIRE load_enzymes to provide RE list

    Every occurrence of each recognition site is reported (0-based start
    index in ``seq``), including occurrences that span the origin of the
    circular plasmid.  The sequence is upper-cased before searching; the
    recognition sites are used as given.

    seq:      template sequence (treated as circular).
    re_ls:    enzymes searched on the given strand only; each entry is
              indexed as [0] name, [1] recognition site.
    re_ls_II: enzymes (Type IIs) searched with the site and its reverse
              complement; same entry layout.  Note that TypeIIs will have
              the cut site outside of the binding site.

    Returns (1) list of RE names
            (2) list of site lists, one per name, in ascending order
    """
    template = seq.upper()
    seq_len = len(template)

    def _circular_hits(site):
        # Appending the first len(site) - 1 bases lets a match wrap around
        # the origin without reporting the site at index 0 a second time.
        # The extension is rebuilt per site so it never accumulates.
        circular = template + template[:max(len(site) - 1, 0)]
        return [i for i in range(seq_len) if circular.startswith(site, i)]

    re_name_ls = []
    re_site_ls = []

    for enzyme in re_ls:
        re_name_ls.append(enzyme[0])
        re_site_ls.append(_circular_hits(enzyme[1]))

    for enzyme in re_ls_II:
        re_name_ls.append(enzyme[0])
        site_fwd = enzyme[1]
        site_rev = Seq_Analyzer(site_fwd).rcSeq()
        re_site_ls.append(
            merge_ascend_ls(_circular_hits(site_fwd),
                            _circular_hits(site_rev)))
    return re_name_ls, re_site_ls
Ejemplo n.º 4
0
def linkerPCR_primers():
    """Make Linker PCR Primers.

    For every chunk in the chunk FASTA file this writes
      * one row "name, forward primer, reverse primer" to the CSV, and
      * one FASTA file "pZX4_<name[:19]>.fasta" holding the chunk sequence
        followed by the first sequence of the backbone file.

    The forward primer is the last 57 bases of the chunk plus a fixed
    tail; the reverse primer is the reverse complement of the first 58
    bases of the chunk plus a fixed tail.  All paths are hard-coded.
    """
    chunks = "yeast_chr01_chunks.FA"
    dirn = "/Users/xuz02/Google_Drive/workspace/Python/150218_Leslie/"
    bacbone = "pZX4_lin.fa"
    output = "liner_primers.csv"
    v_fasta = Fasta(dirn + bacbone)
    c_fasta = Fasta(dirn + chunks)
    left = 58
    right = 57
    reverse = "ggccggccccagcttttgttc"
    forward = "cggccggccctatagtgagtcg"

    # Context managers guarantee both handles are closed even when a chunk
    # fails part-way through.
    with open(dirn + output, "w") as o_fp:
        o_fp.write("Name, Forward Primer, Reverse Primer\n")
        for n, chunk_seq in enumerate(c_fasta.Seqs):
            f_primer = chunk_seq[-right:] + forward
            r_primer = Seq_Analyzer(chunk_seq[:left]).rcSeq() + reverse
            name = c_fasta.Names[n]
            # Names are cut to their first 19 characters for file names
            # and CSV labels.
            short_name = name[:19]
            with open(dirn + "pZX4_" + short_name + ".fasta", "w") as fp:
                fp.write(">%s\n" % name)
                fp.write(chunk_seq + v_fasta.Seqs[0])
            o_fp.write("%s, %s, %s\n" % (short_name, f_primer, r_primer))
Ejemplo n.º 5
0
def batch_PCR_Primers_at_End():
    """Design end primers for every sequence in a folder of FASTA files.

    Every file in the hard-coded folder whose name contains "fasta" is
    loaded; Seq_Analyzer.Find_Primer_at_Ends is run on each sequence for
    Tm values 52..58.  For each Tm slot flagged truthy in the returned
    Tm_bin, one CSV row is written: the sequence name (last character
    dropped), four fields of the forward primer, four fields of the
    reverse primer and field [4] of the reverse primer.
    """
    folder = "/workspace/Python/170116_SynIV/minichunks/"
    savefile = "/workspace/Python/170116_SynIV/primers.csv"

    L_res = 2
    R_res = 2
    Tm_ls = list(range(52, 59))
    minLen = 20
    maxLen = 50
    primer_ls = []

    for fn in os.listdir(folder):
        if "fasta" not in fn:
            continue
        fa = Fasta(folder + fn)
        for n, seq in enumerate(fa.Seqs):
            F_primer, R_primer, Tm_bin = Seq_Analyzer(seq).\
                Find_Primer_at_Ends(L_res, R_res, Tm_ls, minLen, maxLen)
            primer_ls.append([fa.Names[n], F_primer, R_primer, Tm_bin])

    # The original never closed this handle, so buffered rows could be
    # lost; the context manager flushes and closes it.
    with open(savefile, "w+") as output:
        for name, F_primer, R_primer, Tm_bin in primer_ls:
            for n in range(len(Tm_ls)):
                if not Tm_bin[n]:
                    continue
                # name[:-1] drops the last character of the name
                # (presumably a trailing newline -- confirm against Fasta).
                fields = [str(name[:-1])]
                for primer in (F_primer, R_primer):
                    fields.extend(str(primer[n][j]) for j in range(4))
                fields.append(str(R_primer[n][4]))
                # Every field, including the last, is followed by a comma.
                output.write(",".join(fields) + ",\n")
Ejemplo n.º 6
0
def GC_tmp():
    """Plot the GC window profile of every flagged sequence in one file.

    Each sequence of the hard-coded FASTA file is analysed with a window
    of 20 and thresholds 0.70 / 0.40.  When the analyser's second return
    value is non-empty, the visual analyser is run for that sequence.
    """
    dirn = "/Users/xuz02/Downloads/"
    fn = "dra_mt.fa"
    fasta = Fasta(dirn + fn)
    for i in range(len(fasta.Seqs)):
        seq = fasta.Seqs[i]
        name = fasta.Names[i]
        GC_content, outlets = Seq_Analyzer(seq).GC_window_analyzer(
            20, 0.70, 0.40)
        if len(outlets) == 0:
            continue
        # NOTE(review): the plot uses 0.30 as its last threshold while the
        # analysis above uses 0.40 -- confirm the difference is intended.
        Seq_Analyzer(seq).GC_window_analyzer_visual(20, 0.70, 0.30, name)
Ejemplo n.º 7
0
def chr04_pcrtag_stat():
    """Stat the pcrtags over the minichunks.

    The primer CSV is split on carriage returns and commas into a flat
    list that is read as alternating (name, sequence) fields.  Tags whose
    name contains "synF" are searched as given; all others are searched
    as their reverse complement.  For every sequence of the mega FASTA
    file containing the tag, one row
    "<sequence name>, <tag name>, <tag name>, <0-based position>"
    is written to the output CSV.  All paths are hard-coded.
    """
    import re
    dirn = "/workspace/Python/161212_megachunk_csPCR/"
    primers = "PCRtags_syn.csv"
    mega = "synIV_mega.fa"
    output = "chr04_pcrtag_stat.csv"
    mega_f = Fasta(dirn + mega)

    with open(dirn + primers, "r") as primers_fp:
        primer_ls = re.split("\r|,", primers_fp.read())

    stats = []
    # Floor division keeps the pair count an integer on Python 2 and 3.
    for i in range(len(primer_ls) // 2):
        tag_name = primer_ls[2 * i]
        tag_seq = primer_ls[2 * i + 1]
        if "synF" in tag_name:
            p_seq = tag_seq.upper()
        elif "\n" not in tag_seq:
            p_seq = Seq_Analyzer(tag_seq).rcSeq().upper()
        else:
            # A sequence field containing a newline is not searchable.
            # Skip it instead of silently reusing the previous tag's
            # sequence (or hitting an undefined name on the first row).
            print("%s skipped!" % tag_name)
            continue
        for j in range(len(mega_f.Names)):
            _pos = mega_f.Seqs[j].upper().find(p_seq)
            if _pos != -1:
                # NOTE(review): the tag name is stored twice; the second
                # column may have been meant to hold something else.
                stats.append([mega_f.Names[j], tag_name, tag_name, _pos])
        print("%s done!" % tag_name)

    with open(dirn + output, "w") as op:
        for item in stats:
            op.write("%s, %s, %s, %d\n" %
                     (item[0], item[1], item[2], item[3]))
    # Kept from the original; assumes Fasta exposes close() -- confirm.
    mega_f.close()
Ejemplo n.º 8
0
def minichunk_for_Twist():
    """Write GC and repeat statistics for every sequence of an order file.

    For each sequence in the hard-coded FASTA file the output receives
      * one GC line  "name;min;max;mean;std;"  (window argument 100), and
      * one repeat line: the name followed by every item returned by
        Repetitive_Seq_analyzer(3, 4, 2), each terminated by ";", with an
        extra ";" closing the line.
    All GC lines are written first, then all repeat lines.
    """
    dirn = "/Users/Zhuwei/Google_Drive/Project Data/ORDERS/Twist/"
    stat_name = "minichunk_stat.csv"
    fasta = Fasta("/Users/Zhuwei/Documents/Order_Twist_140607.fasta")

    GC_rows = []
    rep_rows = []
    for _n in range(fasta.size):
        analyzer_seq = fasta.Seqs[_n]
        name = fasta.Names[_n]
        # NOTE(review): elsewhere GC_window_analyzer is called with three
        # arguments and returns a pair; here its single-argument result is
        # used directly as the list of GC values -- confirm.
        _GC_ls = Seq_Analyzer(analyzer_seq).GC_window_analyzer(100)
        print("pass GC\n")
        _mean = numpy.mean(_GC_ls)
        _min = min(_GC_ls)
        _max = max(_GC_ls)
        _std = numpy.std(_GC_ls)
        GC_rows.append([name, _min, _max, _mean, _std])
        _rep_ls = Seq_Analyzer(analyzer_seq).Repetitive_Seq_analyzer(3, 4, 2)
        print("pass rep\n")
        rep_row = [name]
        rep_row.extend(_rep_ls)
        rep_rows.append(rep_row)

    # The original left this handle open; the context manager makes sure
    # the statistics are flushed and the file is closed.
    with open(dirn + stat_name, "w") as stat_fn:
        for row in GC_rows:
            stat_fn.write("".join(str(item) + ";" for item in row) + "\n")
        for row in rep_rows:
            stat_fn.write("".join(str(item) + ";" for item in row) + ";\n")
Ejemplo n.º 9
0
 def findSeq(self):
     """Search the current sequence for the pattern typed in the find box.

     The pattern is echoed to stdout.  When the current sequence is empty
     nothing else happens.  Otherwise the hit list from
     Seq_Analyzer.Degen_searcher is stored in ``self.seqFindLst``, the hits
     are highlighted in the sequence editor and listed, separated by
     " ; ", in the selection field.
     """
     query = str(self.findSeqLine.text())
     print(query)
     if not str(self.currentSeq):
         return
     hits = Seq_Analyzer(self.currentSeq).Degen_searcher(query)
     self.seqFindLst = hits
     # Every position, including the last, is followed by " ; ".
     hit_text = "".join(str(position) + " ; " for position in hits)
     self.SeqEdit.hlightRegion(self.seqFindLst, len(query))
     self.SelectEdit.setText(hit_text)
Ejemplo n.º 10
0
def Chara_analyzer():
    """Count selected base / dinucleotide / motif words per sequence.

    For every sequence in the hard-coded reference FASTA file and every
    entry of the word table, the hits reported by
    Seq_Analyzer.Degen_searcher are counted and one row
    "<sequence name>,<word label>,<count>" is written to the output CSV.
    """
    # Built from adjacent literals: the original backslash continuations
    # embedded runs of spaces (the continuation lines' indentation) inside
    # the directory names, producing a path that cannot exist.
    fasta = Fasta(
        "/Users/Zhuwei/Google_Drive/workspace/Python/141008_JIngchuan/"
        "S288C_reference_genome_R64-1-1_20110203"
        "/S288C_reference_sequence_R64-1-1_20110203.fsa")
    output = ("/Users/Zhuwei/Google_Drive/workspace/Python/141008_Jingchuan/"
              "chara_stat_4.csv")

    word = {
        "C": ["C"],
        "G": ["G"],
        "A": ["A"],
        "T": ["T"],
        "CpG": ["CG"],
        "CpT": ["CT"],
        "ApG": ["AG"],
        "CpC": ["CC"],
        "GpG": ["GG"],
        "CpA": ["CA"],
        "TpG": ['TG'],
        "CCGG": ["CCGG"]
    }

    # The original never closed the output file; the context manager
    # guarantees the rows are flushed.
    with open(output, 'w') as fp:
        for i in range(len(fasta.Names)):
            seq = Seq_Analyzer(fasta.Seqs[i])
            name = fasta.Names[i]
            for idx in word:
                # A label may map to several patterns; their hits are
                # summed into a single count.
                count = sum(
                    len(seq.Degen_searcher(site)) for site in word[idx])
                fp.write(name + "," + idx + "," + str(count) + "\n")
Ejemplo n.º 11
0
def off_target_binding(seq, ref, rc=False):
    """Return True when primer ``seq`` can bind ``ref`` off-target.

    seq: primer sequence.
    ref: reference sequence to screen against.
    rc:  False when the primer's intended site appears in ``ref`` as
         ``seq`` itself; True when it appears as the reverse complement
         of ``seq``.

    The primer is rejected when
      * the opposite orientation of the intended site occurs in ``ref``,
      * the intended site occurs more than once, or
      * easy_blast reports a hit for the primer or its reverse complement
        in ``ref`` with the first intended site removed.
    """
    seq_r = Seq_Analyzer(seq).rcSeq()
    if rc:
        intended, opposite = seq_r, seq
    else:
        intended, opposite = seq, seq_r

    if opposite in ref:
        return True

    pos = ref.find(intended)
    # Only cut the intended site out when it was actually found.  With
    # pos == -1 the original slicing dropped the last base and appended
    # most of the reference a second time, corrupting the blast input.
    if pos != -1:
        ref = ref[:pos] + ref[pos + len(seq):]
        if intended in ref:
            return True

    if easy_blast(seq=seq, ref=ref):
        return True
    if easy_blast(seq=seq_r, ref=ref):
        return True
    return False
Ejemplo n.º 12
0
def minichunk_csPCR_primers_two_pairs(dirn,
                                      minichunk,
                                      chunk,
                                      Tm,
                                      ref="",
                                      Lim=150,
                                      pcr=300):
    """ Function to make junction primers for csPCR.

    NAME OF THE MINICHUNKS = CHUNKNAME.01-0X
    Two pairs of csPCR is generated (junction 1|2 and junction 2|3).
    Minichunks are sorted by the last character of their name ("1", "2"
    or "3"); any other minichunk is ignored.

    dirn: working dir (prefixed to every file name below)
    minichunk: name of the multi fasta file with all the minichunks
    chunk: name of the fasta file with the chunks; the n-th chunk is
           matched to the n-th minichunk of each group
    ref: reference genome file; its first sequence is used as cross
         reference and for the genome positions
    Tm : Expected Tm
    Lim: forwarded as second argument to Find_Forward_Primer /
         Find_Reverse_Primer (meaning defined by Seq_Analyzer)
    pcr: upper limit of the length of the PCR product at each site; only
         the last / first ``pcr`` bases of a minichunk are searched

    Writes "csPCR_primers1.csv" into ``dirn``.  Returns None; nothing is
    written when the minichunk groups or the chunk count do not match.
    """
    # upper limit of the length of the PCR product at each site
    half_PCR_length = pcr
    m_fasta = Fasta(dirn + minichunk)
    ref_fasta = Fasta(dirn + ref)
    c_fasta = Fasta(dirn + chunk)

    output_fn = "csPCR_primers1.csv"

    # Group the minichunks by the trailing digit of their name.
    groups = {"1": [], "2": [], "3": []}
    for i, mc_seq in enumerate(m_fasta.Seqs):
        mc_name = m_fasta.Names[i]
        print(mc_name[-1])
        if mc_name[-1] in groups:
            groups[mc_name[-1]].append([mc_seq, mc_name])
    first_minichunk = groups["1"]
    second_minichunk = groups["2"]
    third_minichunk = groups["3"]

    if (len(first_minichunk) != len(second_minichunk)
            or len(first_minichunk) != len(third_minichunk)):
        print("Not enough minichunks to make 2 pairs of csPCR primers!\n")
        return

    def _design(region, primer_name, reverse):
        # Build [name, <finder fields...>]; when the finder returns
        # nothing, pad the record with placeholders so later indexing and
        # the CSV layout still work.
        analyzer = Seq_Analyzer(region)
        if reverse:
            found = analyzer.Find_Reverse_Primer(
                Tm, Lim, crossref=ref_fasta.Seqs[0])
        else:
            found = analyzer.Find_Forward_Primer(
                Tm, Lim, crossref=ref_fasta.Seqs[0])
        primer = [primer_name]
        primer.extend(found)
        if len(primer) == 1:
            print("%s FAILED!\n" % (primer_name))
            primer.extend(["", -1, -1, -1, -1, -1])
        return primer

    # Forward primers sit at the 3' end of a minichunk, reverse primers
    # at the 5' end of the following one.  [:-3] strips the minichunk
    # suffix from the name.
    forward_12 = [
        _design(mc[0][-half_PCR_length:], mc[1][:-3] + "_F_junc12", False)
        for mc in first_minichunk
    ]
    reverse_12 = []
    forward_23 = []
    for mc in second_minichunk:
        primerF = _design(mc[0][-half_PCR_length:], mc[1][:-3] + "_F_junc23",
                          False)
        primerR = _design(mc[0][:half_PCR_length], mc[1][:-3] + "_R_junc12",
                          True)
        reverse_12.append(primerR)
        forward_23.append(primerF)
    reverse_23 = [
        _design(mc[0][:half_PCR_length], mc[1][:-3] + "_R_junc23", True)
        for mc in third_minichunk
    ]

    # Verify the PCR products
    size = len(forward_12)
    if size != len(c_fasta.Seqs):
        print("# of reference chunks(%d) != # of primer request (%d)" %
              (len(c_fasta.Seqs), size))
        return
    refseq = ref_fasta.Seqs[0]

    def _forward_pos(template, primer_seq):
        # 1-based 5' position of a forward primer (0 when absent).
        return template.find(primer_seq) + 1

    def _reverse_pos(template, primer_seq):
        # 1-based 5' position of a reverse primer: end of its reverse
        # complement on the template.
        return (template.find(Seq_Analyzer(primer_seq).rcSeq()) +
                len(primer_seq))

    # The file is opened only once all checks have passed, so the early
    # returns above can no longer leak an open handle.
    with open(dirn + output_fn, "w") as output:
        output.write("Name, Sequence, 5' at minichunk, length, GC ratio"
                     " , flag_GC, flag_multi, Tm, 5' at chunk, 5' at genome,"
                     " PCR at chunk"
                     " , PCR at genome\n")
        for n in range(size):
            pf12 = forward_12[n]
            pr12 = reverse_12[n]
            pf23 = forward_23[n]
            pr23 = reverse_23[n]
            chunkseq = c_fasta.Seqs[n]

            loc_f12 = _forward_pos(chunkseq, pf12[1])
            loc_f23 = _forward_pos(chunkseq, pf23[1])
            loc_r12 = _reverse_pos(chunkseq, pr12[1])
            loc_r23 = _reverse_pos(chunkseq, pr23[1])

            gen_f12 = _forward_pos(refseq, pf12[1])
            gen_f23 = _forward_pos(refseq, pf23[1])
            gen_r12 = _reverse_pos(refseq, pr12[1])
            gen_r23 = _reverse_pos(refseq, pr23[1])

            # Product sizes on the chunk and on the reference genome.
            len_pcr_12 = loc_r12 - loc_f12
            len_pcr_23 = loc_r23 - loc_f23
            len_gen_12 = gen_r12 - gen_f12
            len_gen_23 = gen_r23 - gen_f23

            pf12.extend([loc_f12, gen_f12, len_pcr_12, len_gen_12])
            pf23.extend([loc_f23, gen_f23, len_pcr_23, len_gen_23])
            pr12.extend([loc_r12, gen_r12, len_pcr_12, len_gen_12])
            pr23.extend([loc_r23, gen_r23, len_pcr_23, len_gen_23])

            for p in (pf12, pr12, pf23, pr23):
                output.write("".join(str(t) + " ," for t in p) + "\n")