Example #1
0
def testUnique(seqs, Ws, prefixes=('ACA',)):
    """Check we count the unique W-mers correctly by counting them in
    an independent way.

    Arguments:
        - *seqs*: The sequences to build the index over.
        - *Ws*: The widths to check.
        - *prefixes*: Prefixes whose per-node unique counts are also
            compared against the independent count.

    Raises:
        ValueError if the total number of unique W-mers differs for any
        width, if a prefix cannot be found in the index, or if the
        number of unique W-mers below a prefix differs for any width.
    """
    import seqan
    import numpy as npy
    from jemima.wmers import countWmersMulti
    index = seqan.IndexStringDNA5SetESA(seqs)
    counts = npy.zeros((2*len(index), len(Ws)), dtype=int)
    numunique = countWmersMulti(
        index.topdownhistory(), Ws, counts, countunique=True)
    # Independent reference count, indexed by width
    unique = _countUnique(seqs, Ws)
    expected = npy.array([len(unique[W]) for W in Ws])
    # A mismatch for *any* single width is a failure; .all() would only
    # fire when every width disagreed and so hide partial mismatches.
    if (expected != numunique).any():
        raise ValueError('Counts did not match.')
    for prefix in prefixes:
        it = index.topdownhistory()
        if not it.goDown(prefix):
            raise ValueError('Prefix "%s" does not exist in text' % prefix)
        for Widx, W in enumerate(Ws):
            # Count with a generator rather than len(filter(...)) so this
            # also works where filter() returns an iterator (Python 3).
            count1 = sum(
                1 for wmer in unique[W] if wmer.startswith(prefix))
            count2 = counts[it.value.id, Widx]
            if count1 != count2:
                raise ValueError(
                    'Counts for "%s" did not match: %s != %s' % (
                        prefix, count1, count2))
Example #2
0
def testUnique(seqs, Ws, prefixes=('ACA', )):
    """Check we count the unique W-mers correctly by counting them in
    an independent way.

    Arguments:
        - *seqs*: The sequences to build the index over.
        - *Ws*: The widths to check.
        - *prefixes*: Prefixes whose per-node unique counts are also
            compared against the independent count.

    Raises:
        ValueError if the total number of unique W-mers differs for any
        width, if a prefix cannot be found in the index, or if the
        number of unique W-mers below a prefix differs for any width.
    """
    import seqan
    import numpy as npy
    from jemima.wmers import countWmersMulti
    index = seqan.IndexStringDNA5SetESA(seqs)
    counts = npy.zeros((2 * len(index), len(Ws)), dtype=int)
    numunique = countWmersMulti(index.topdownhistory(),
                                Ws,
                                counts,
                                countunique=True)
    # Independent reference count, indexed by width
    unique = _countUnique(seqs, Ws)
    expected = npy.array([len(unique[W]) for W in Ws])
    # Fail as soon as one width disagrees; the previous .all() test only
    # raised when every width disagreed at once.
    if (expected != numunique).any():
        raise ValueError('Counts did not match.')
    for prefix in prefixes:
        it = index.topdownhistory()
        if not it.goDown(prefix):
            raise ValueError('Prefix "%s" does not exist in text' % prefix)
        for Widx, W in enumerate(Ws):
            # sum() over a generator works whether or not filter() would
            # return a list, unlike len(filter(...)) which breaks on
            # Python 3.
            count1 = sum(1 for wmer in unique[W]
                         if wmer.startswith(prefix))
            count2 = counts[it.value.id, Widx]
            if count1 != count2:
                raise ValueError('Counts for "%s" did not match: %s != %s' %
                                 (prefix, count1, count2))
Example #3
0
def _countWmers(index, Ws, countunique):
    """Count W-mers for every node of *index* and the normalised
    frequencies of the W-mers represented by each node's children.

    Arguments:
        - *index*: The index to count over.
        - *Ws*: The widths to count for.
        - *countunique*: Passed through to wmers.countWmersMulti.

    Returns a tuple (rootcounts, counts, childfreqs) where rootcounts is
    the value returned by wmers.countWmersMulti for the root iterator,
    counts has shape (2*len(index), len(Ws)) and childfreqs has shape
    (2*len(index), len(Ws), jem.SIGMA) after normalisation.
    """
    # One row per possible node id, one column per width
    nodeshape = (2 * len(index), len(Ws))
    counts = npy.zeros(nodeshape, dtype=npy.uint)
    rootcounts = wmers.countWmersMulti(
        index.topdownhistory(), Ws, counts, countunique=countunique)
    # Tally how many W-mers the children of each node represent,
    # then normalise the tallies
    rawchildfreqs = npy.zeros(nodeshape + (jem.SIGMA,))
    wmers.countChildren(index.topdownhistory(), Ws, counts, rawchildfreqs)
    return rootcounts, counts, jem.normalisearray(rawchildfreqs)
Example #4
0
def _countWmers(index, Ws, countunique):
    """Build the per-node W-mer counts for *index* together with the
    normalised child frequencies.

    Arguments:
        - *index*: The index to count over.
        - *Ws*: The widths to count for.
        - *countunique*: Forwarded to wmers.countWmersMulti.

    Returns (rootcounts, counts, childfreqs): the result of
    wmers.countWmersMulti on the root iterator, the filled counts array
    of shape (2 * len(index), len(Ws)) and the normalised array of shape
    (2 * len(index), len(Ws), jem.SIGMA).
    """
    numrows = 2 * len(index)
    numwidths = len(Ws)
    counts = npy.zeros((numrows, numwidths), dtype=npy.uint)
    rootcounts = wmers.countWmersMulti(index.topdownhistory(),
                                       Ws,
                                       counts,
                                       countunique=countunique)
    # Accumulate the number of W-mers represented by the children of
    # each node, then normalise
    unnormalised = npy.zeros((numrows, numwidths, jem.SIGMA))
    wmers.countChildren(index.topdownhistory(), Ws, counts, unnormalised)
    childfreqs = jem.normalisearray(unnormalised)
    return rootcounts, counts, childfreqs
Example #5
0
def countWmersMulti(it, Ws, counts, countunique=True):
    """Count all the :math:`W`-mer occurrences (or unique W-mers)
    below the iterator for multiple widths, Ws.

    Arguments:
        - *it*: The iterator below which to count occurrences.
        - *Ws*: The widths to count for. The last entry is treated as
            the largest width and the list is searched with bisect, so
            it is presumably sorted ascending (confirm with callers).
        - *counts*: The counts array of shape
            (2*len(index), len(Ws)). Rows are filled in place.
        - *countunique*: If true, count the number of unique W-mers below
            each vertex for each width. Otherwise count the number of
            occurrences.

    Returns the row counts[it.value.id] for the node *it* points at.
    """
    row = counts[it.value.id]
    widest = Ws[-1]
    unknownpos = findfirstparentunknown(it, widest)
    # Only descend when no unknown base was found inside the
    # representative and the representative is still shorter than the
    # largest width.
    mustdescend = unknownpos == it.repLength and it.repLength < widest
    if mustdescend and it.goDown():
        # Visit every child in turn, accumulating its counts into ours
        haschild = True
        while haschild:
            row += countWmersMulti(it, Ws, counts, countunique)
            haschild = it.goRight()
        it.goUp()
    # Index one past the last width covered by our representative
    stopidx = bisect.bisect(Ws, unknownpos)
    # Index one past the last width already covered by our parent's
    # representative (none for the root)
    if it.isRoot:
        startidx = 0
    else:
        startidx = bisect.bisect(
            Ws[:stopidx], it.repLength - it.parentEdgeLength)
    # Widths that first become covered on the edge into this node are
    # counted here rather than summed from children
    row[startidx:stopidx] = 1 if countunique else it.numOccurrences
    return row
Example #6
0
def countWmersMulti(it, Ws, counts, countunique=True):
    """Count all the :math:`W`-mer occurrences (or unique W-mers)
    below the iterator for multiple widths, Ws.

    Arguments:
        - *it*: The iterator below which to count occurrences.
        - *Ws*: The widths to count for. The final entry is used as the
            largest width and bisect is applied to the list, so it is
            presumably sorted ascending (confirm with callers).
        - *counts*: The counts array of shape
            (2*len(index), len(Ws)). It is updated in place.
        - *countunique*: If true, count the number of unique W-mers below
            each vertex for each width. Otherwise count the number of
            occurrences.

    Returns the row counts[it.value.id] belonging to the current node.
    """
    thiscounts = counts[it.value.id]
    largestW = Ws[-1]
    firstN = findfirstparentunknown(it, largestW)
    # Descend only if the representative contains no unknown base and
    # has not yet reached the largest width.
    if firstN == it.repLength and it.repLength < largestW:
        wentdown = it.goDown()
        if wentdown:
            # Sum the counts of all siblings at the level below
            moresiblings = True
            while moresiblings:
                thiscounts += countWmersMulti(it, Ws, counts, countunique)
                moresiblings = it.goRight()
            it.goUp()
    # Which widths does our representative reach?
    upper = bisect.bisect(Ws, firstN)
    # Which of those did our parent's representative already reach?
    # The root has no parent, so nothing is excluded for it.
    if it.isRoot:
        lower = 0
    else:
        parentlength = it.repLength - it.parentEdgeLength
        lower = bisect.bisect(Ws[:upper], parentlength)
    # Overwrite the widths that are resolved at this node
    if countunique:
        fillvalue = 1
    else:
        fillvalue = it.numOccurrences
    thiscounts[lower:upper] = fillvalue
    return thiscounts
Example #7
0
# Script fragment: load sequences, index them, count W-mers and draw one
# seed W-mer by importance sampling. W, numseedsites and the module
# aliases used here (logging, seqan, npy, wmers, jem, jis, rdm) are
# expected to be defined earlier in the file.
# jem.logo(runx1pwm, 'runx1')
# jem.logo(runx1withpc, 'runx1-pc')

logging.info('Loading sequences')
# seqs = seqan.StringDNASet(('AAAAAAAA', 'ACGTACGT', 'TATATATA'))
numbases, seqs, ids = seqan.readFastaDNA('T00759-small.fa')
logging.info('Loaded %d bases from %d sequences', numbases, len(seqs))
# Number of sequences per base
lambda_ = len(seqs) / float(numbases)

logging.info('Building index')
index = seqan.IndexStringDNASetESA(seqs)

logging.info('Counting W-mers')
# Only a single width is used, so the width axis of the arrays below has
# length one.
Ws = [W]
# One row per possible node id, one column per width
Wmercounts = npy.zeros((2*len(index), len(Ws)), dtype=npy.uint)
# countWmersMulti is called on the root iterator; element 0 of its
# result is the entry for the single width W.
numWmers = wmers.countWmersMulti(index.topdownhistory(), Ws, Wmercounts)[0]
logging.info('Got %d %d-mers', numWmers, W)
# Filled in by countWmerChildren, then normalised
childWmerfreqs = npy.zeros((2*len(index), len(Ws), jem.SIGMA))
# NOTE(review): the scalar W is passed here, not the list Ws - confirm
# this matches countWmerChildren's signature.
wmers.countWmerChildren(index.topdownhistory(), W, Wmercounts, childWmerfreqs)
childWmerfreqs = jem.normalisearray(childWmerfreqs)
sumestimator = jis.makesumestimator(numWmers)

logging.info('Importance sampling using background model to find one seed')
# Fixed seed; rdm is presumably a random-number module, which would make
# the sampled seed repeatable - confirm against the file's imports.
rdm.seed(2)
# childWmerfreqs[:, 0] selects the frequencies for the only width, W.
# A single sample is drawn and recorded by the ISCbMemo callback.
memocb = jis.importancesample(
    index, W, childWmerfreqs[:, 0], jis.UniformImportanceWeight(),
    numsamples=1, callback=jis.ISCbMemo())
# Build a PWM from the first (only) recorded sample and draw its logo
pwm = jem.pwmfromWmer(memocb.Xns[0], numseedsites, 1.)
jem.logo(pwm, 'seed')

numsamples = 3000