Exemple #1
0
    def search(self,dnaSeq,pValThresh=0.05,halfAT=0.25,halfGC=0.25):
        """Return motility hits tuple.

        The portion visible here only prepares the cached motility PWM
        (``self._motility``) and the score threshold matching ``pValThresh``.
        NOTE(review): the code that scans ``dnaSeq`` and returns the hits is
        not visible in this excerpt.

        Parameters:
            dnaSeq      -- sequence to search (not used by the visible code).
            pValThresh  -- p-value cutoff the score threshold is derived from.
            halfAT      -- passed as ``AT_bias`` to ``weight_sites_over``.
            halfGC      -- passed as ``GC_bias`` to ``weight_sites_over``.
        """
        def _setThresh(newPval):
            """Return the lowest sampled score whose p-value is <= newPval.

            Scores are sampled from 100% down to 1% of the maximum score in
            1% steps.  Returns None when even the maximum score does not
            satisfy the cutoff.
            """
            # NOTE(review): `mot` is not defined anywhere in this view; it is
            # presumably the PWM held in self._motility -- confirm.
            maxScore = mot.max_score()
            # TODO: use mot.min_score() as the lower bound of the scan.
            thresh = None
            for pct in range(100, 0, -1):
                score = maxScore*(pct/100.0)
                pVal = mot.weight_sites_over(score,AT_bias=halfAT, GC_bias=halfGC)
                print('score: %s pVal: %s ' % (score,pVal))
                if pVal <= newPval:
                    # Still within the cutoff: remember it and try a lower score.
                    thresh = score
                else:
                    # Cutoff exceeded; the scan assumes lower scores only get worse.
                    break
            # The decision is made after the scan finishes; returning from
            # inside the loop would stop at the very first (maximum) score.
            return thresh

        if not self._motility:
            self._motility = motility.make_pwm(self._getMotilityMatrix())
            self._motility.threshPval  = pValThresh
            self._motility.threshScore = _setThresh(pValThresh)
        if not self._motility.threshPval == pValThresh:
            # The caller asked for a different p-value than the cached one:
            # record it and recompute the threshold for the new value.
            self._motility.threshPval  = pValThresh
            self._motility.threshScore = _setThresh(pValThresh)
Exemple #2
0
def test_5():
    """
    Check that an operator's calc_score and calc_energy give equal results.
    """
    site = 'ACGG'

    matrix = motility.make_pwm([site])
    op = motility.make_operator([site])

    # Diagnostic output only: the PWM score is printed but never asserted on.
    print(matrix.calc_score(site))

    # Show both operator values before comparing them.
    print(op.calc_score(site))
    print(op.calc_energy(site))
    assert op.calc_score(site) == op.calc_energy(site)
def test_5():
    """
    Verify that calc_score and calc_energy of an operator are equivalent.
    """
    kmer = 'ACGG'
    kmers = [kmer]

    weight_matrix = motility.make_pwm(kmers)
    energy_op = motility.make_operator(kmers)

    # Printed for inspection only; no assertion is made on the PWM score.
    print(weight_matrix.calc_score(kmer))

    # Printed for inspection, then recomputed for the actual comparison.
    print(energy_op.calc_score(kmer))
    print(energy_op.calc_energy(kmer))
    assert energy_op.calc_score(kmer) == energy_op.calc_energy(kmer)
Exemple #4
0
def test_4():
    """
    Check misc coordinate handling / match string extraction.

    The PWM, IUPAC and exact searches are run over the same sequence and
    their results compared.
    """
    site = 'ACGG'
    matrix = motility.make_pwm([site])

    # Sequence identical to the motif: all three searches must agree.
    hits_pwm = matrix.find(site, 4)
    hits_iupac = motility.find_iupac(site, site)
    hits_exact = motility.find_exact(site, site)

    assert hits_pwm == hits_iupac
    assert hits_pwm == hits_exact

    # Reverse complement of the motif as the searched sequence.
    revcomp = 'CCGT'

    hits_pwm = matrix.find(revcomp, 4)
    hits_iupac = motility.find_iupac(revcomp, site)
    # The exact search is still run here, but its result is not asserted on.
    hits_exact = motility.find_exact(revcomp, site)

    assert hits_pwm == hits_iupac
def test_4():
    """
    Exercise coordinate handling and match-string extraction.

    Results of the PWM search are compared with the IUPAC and exact searches.
    """
    kmer = 'ACGG'
    weight_matrix = motility.make_pwm([kmer])

    # Forward case: the searched sequence is the motif itself.
    from_pwm = weight_matrix.find(kmer, 4)
    from_iupac = motility.find_iupac(kmer, kmer)
    from_exact = motility.find_exact(kmer, kmer)

    assert from_pwm == from_iupac
    assert from_pwm == from_exact

    # Reverse-complement case: search 'CCGT' for the 'ACGG' motif.
    rc_kmer = 'CCGT'

    from_pwm = weight_matrix.find(rc_kmer, 4)
    from_iupac = motility.find_iupac(rc_kmer, kmer)
    # Computed as in the forward case, but only PWM vs IUPAC is checked below.
    from_exact = motility.find_exact(rc_kmer, kmer)

    assert from_pwm == from_iupac
Exemple #6
0
def get_consensus_snv_arrary(
    dic_chr_pos_snv, chr, snv_pos_list, snv_arrary_list
):
    """Build the consensus ("phased") SNV array of a set of long reads.

    Steps:
      1. Keep only the SNV positions where at least half of the reads carry
         a nucleotide (anything other than "-").
      2. Among reads with no "-" at the kept positions, take the most common
         array as the major allele when it is supported by >= 5 reads;
         otherwise take the array with the highest score under a
         Position-Weight Matrix built from those reads.
      3. Derive the minor allele position by position as the other entry of
         ``dic_chr_pos_snv[chr][pos]``.
      4. Count the reads matching the major allele, the minor allele, or
         neither at more than half of the kept positions.

    Args:
        dic_chr_pos_snv: mapping indexed as ``[chr][int(pos)]`` whose value
            holds the two alleles of the SNV at items ``[0]`` and ``[1]``.
        chr: key into ``dic_chr_pos_snv``.
        snv_pos_list: SNV positions, one per column of the read arrays.
        snv_arrary_list: one indexable sequence per read, with one character
            per entry of ``snv_pos_list``; "-" marks a missing nucleotide.

    Returns:
        A tab-separated string with six fields: comma-joined kept positions,
        major allele, minor allele, and the major / minor / uncertain read
        counts.  Six "*" fields are returned when no consensus can be built
        (no reads, no position kept, or no read free of "-").
    """
    no_call = "\t".join(["*"] * 6)

    # Without reads the coverage fraction below is undefined (division by
    # zero), so report "no consensus" like the other empty cases.
    if not snv_arrary_list:
        return no_call

    # First step: drop SNV positions with too much missing data.
    read_total = len(snv_arrary_list)
    retain_snv_idx = []
    for i in range(len(snv_pos_list)):
        covered = sum(1 for snv_arrary in snv_arrary_list
                      if snv_arrary[i] != "-")
        # float() keeps this a true division on Python 2 as well.
        if float(covered) / read_total >= 0.5:
            retain_snv_idx.append(i)
    if not retain_snv_idx:
        return no_call
    retain_snv_pos_list = [str(snv_pos_list[i]) for i in retain_snv_idx]
    retain_snv_pos = ",".join(retain_snv_pos_list)

    # Second step: restrict every read to the kept positions.
    new_snv_arrary_list = [
        "".join(snv_arrary[idx] for idx in retain_snv_idx)
        for snv_arrary in snv_arrary_list
    ]

    # Only reads without missing data take part in the consensus.
    consus_snv_arrary_list = [
        arr for arr in new_snv_arrary_list if "-" not in arr
    ]
    if not consus_snv_arrary_list:
        return no_call

    top_arrary, top_support = collections.Counter(
        consus_snv_arrary_list).most_common(1)[0]
    if top_support >= 5:
        # The most common array is supported by >= 5 long reads.
        major_allele = top_arrary
    else:
        # Too little direct support: score each distinct array with a PWM.
        pwm = motility.make_pwm(consus_snv_arrary_list)
        # Candidates are sorted so that ties on the score are resolved the
        # same way on every run instead of depending on set iteration order.
        major_allele = max(
            sorted(set(consus_snv_arrary_list)),
            key=lambda arr: float(pwm.calc_score(arr)))

    # Minor allele: at each kept position, the allele that is not the major one.
    minor_phase_list = []
    for i, pos in enumerate(retain_snv_pos_list):
        alleles = dic_chr_pos_snv[chr][int(pos)]
        if alleles[0] != major_allele[i]:
            minor_phase_list.append(alleles[0])
        else:
            minor_phase_list.append(alleles[1])
    minor_allele = "".join(minor_phase_list)

    # Quantify allele-specific read counts; reads with "-" are included here.
    major_allele_c, minor_allele_c, uncertain_allele_c = 0, 0, 0
    for new_snv_arrary in new_snv_arrary_list:
        common_count_major = sum(
            1 for a, b in zip(major_allele, new_snv_arrary) if a == b)
        common_count_minor = sum(
            1 for a, b in zip(minor_allele, new_snv_arrary) if a == b)
        if float(common_count_major) / len(major_allele) > 0.5:
            major_allele_c += 1
        elif float(common_count_minor) / len(minor_allele) > 0.5:
            minor_allele_c += 1
        else:
            uncertain_allele_c += 1

    return "\t".join([
        retain_snv_pos, major_allele, minor_allele,
        str(major_allele_c),
        str(minor_allele_c),
        str(uncertain_allele_c),
    ])
from TAMO.MotifTools import Motif
import motility

# Build a TAMO Motif from the pattern 'WGATAR' and derive a list of k-mers
# from it with bogus_kmers().
tM = Motif('WGATAR')
sites = tM.bogus_kmers()

# Rebuild both representations from the same k-mer list so that the TAMO
# Motif (tM) and the motility PWM (mM) are constructed from identical sites.
tM = Motif(sites)
mM = motility.make_pwm(sites)


# Example sequence to search.
s = 'ATGCATGCTAGCGGCTGATAACGCTTATCATATGC'

# Search s with the PWM; the second argument is 75% of the PWM's maximum
# score (presumably the minimum score for a hit -- confirm against motility).
mReults = mM.find(s,mM.max_score()*0.75,)
# Path of the text file whose lines are looked up in `seqs` below
# (presumably one gene name per line -- confirm against the data file).
targetGenes = "/Users/biggus/Documents/James/Data/ReClustering/kmedsPear33Clus50x_2/Clus2_247genes.genes.txt"
# Read inside a `with` block so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(targetGenes, "rU") as geneFile:
    targetGenes = [l.strip() for l in geneFile]
# Keep only the first `genes` entries (`genes` is defined elsewhere).
targetGenes = targetGenes[:genes]
# Replace each entry, in place, by its value in `seqs` (defined elsewhere).
for i, geneName in enumerate(targetGenes):
    targetGenes[i] = seqs[geneName]


# Collect every motif returned by loadTMOs() for the files in `tmoFiles`
# (both names are defined outside this excerpt).
motifs = []
tMotifs = []
mMotifs = []
for t in tmoFiles:
    Ms = loadTMOs(t)
    motifs.extend(Ms)
# For each loaded motif build the two representations being compared:
# a TAMO Motif and a motility PWM, each from the motif's bogus_kmers().
# NOTE(review): bogus_kmers() is called twice per motif; if it is not
# deterministic the two representations may be built from different k-mers.
for i in range(len(motifs)):
    tMotifs.append(Motif(motifs[i].bogus_kmers()))
    mMotifs.append(motility.make_pwm(motifs[i].bogus_kmers()))


# Statements to be timed: scan every target sequence with every motif,
# once through TAMO (scan) and once through motility (find).
# NOTE(review): the TAMO run uses factor=0.75 while the motility run uses
# 0.85 * max_score -- confirm the differing cutoffs are intentional.
tTimeIt = """for m in tMotifs:
    for s in targetGenes:
        m.scan(s,factor=0.75)
"""
mTimeIt = """for m in mMotifs:
    for s in targetGenes:
        m.find(s,m.max_score()*0.85)
"""


# Timers whose setup statement imports the motif lists and the target
# sequences from the running script (__main__).
tTimer = timeit.Timer(tTimeIt, "from __main__ import tMotifs,targetGenes")
mTimer = timeit.Timer(mTimeIt, "from __main__ import mMotifs,targetGenes")