Esempio n. 1
0
def findOutcastOfLength(seqs, length, shiftStep=0.1, norm="each", mdl=False):
    """Return the best-scoring outcast of the given length, or None.

    Seed indices are derived from the motif pair found by
    findMotifOfLengthFast. Each in-bounds seed is scored with
    findOutcastInstances (or findOutcastInstancesMDL when ``mdl`` is true),
    and the result with the highest ``score`` is kept.

    Args:
        seqs: collection of sequences. A shallow copy is handed to the motif
            search; only ``len(seqs[0])`` is used for bounds checking.
        length: subsequence length passed to the motif search and scorers.
        shiftStep: forwarded to computeAllSeedIdxsFromPair.
        norm: forwarded to findMotifOfLengthFast.
        mdl: if true, score seeds with findOutcastInstancesMDL.

    Returns:
        The scorer result with the largest ``score``, or None when there are
        no in-bounds seeds or no seed produces a truthy result.
    """
    motif = findMotifOfLengthFast(seqs[:], length, norm=norm)
    Xnorm = motif.Xnorm

    # NOTE(review): elsewhere this helper is called as
    # (pairOfIdxs, numShifts, stepLen); confirm that the
    # (idx1, idx2, shiftStep) form used here is supported.
    seedIdxs = computeAllSeedIdxsFromPair(motif.idx1, motif.idx2, shiftStep)
    # XXX this assumes only one seq
    seedIdxs, _ = startsAndEndsWithinBounds(seedIdxs, length, len(seqs[0]))

    findInstances = findOutcastInstancesMDL if mdl else findOutcastInstances

    bestScore = -np.inf
    bestOutcast = None
    for idx in seedIdxs:
        info = findInstances(Xnorm, Xnorm[idx], length)
        # falsy results mean "nothing found for this seed" and are skipped
        if info and info.score > bestScore:
            bestScore = info.score
            bestOutcast = info

    # bestOutcast is still None if no seed produced a usable result; only
    # report when there is something to report instead of raising
    # AttributeError on None.
    if bestOutcast is not None:
        print("bestOutcast idxs at length {}: {}".format(
            length, bestOutcast.idxs))

    return bestOutcast
Esempio n. 2
0
def findOutcastOfLength(seqs, length, shiftStep=.1, norm='each', mdl=False):
    """Find the highest-scoring outcast of a fixed length.

    The motif pair returned by findMotifOfLengthFast is expanded into a set
    of seed indices; seeds that fit within the first sequence are each scored
    by findOutcastInstances (findOutcastInstancesMDL when ``mdl`` is true).

    Args:
        seqs: collection of sequences. A shallow copy is passed to the motif
            search; bounds are checked against ``len(seqs[0])`` only.
        length: subsequence length used by the motif search and the scorers.
        shiftStep: forwarded to computeAllSeedIdxsFromPair.
        norm: forwarded to findMotifOfLengthFast.
        mdl: selects the MDL-based scorer when true.

    Returns:
        The scorer result whose ``score`` is largest, or None if there were
        no in-bounds seeds or every seed returned a falsy result.
    """
    motif = findMotifOfLengthFast(seqs[:], length, norm=norm)
    Xnorm = motif.Xnorm

    # NOTE(review): learnFF-style callers pass (pairOfIdxs, numShifts,
    # stepLen) to this helper; verify the argument form used here.
    seedIdxs = computeAllSeedIdxsFromPair(motif.idx1, motif.idx2, shiftStep)
    # XXX this assumes only one seq
    seedIdxs, _ = startsAndEndsWithinBounds(seedIdxs, length, len(seqs[0]))

    bestScore = -np.inf
    bestOutcast = None
    for idx in seedIdxs:
        seed = Xnorm[idx]
        if mdl:
            info = findOutcastInstancesMDL(Xnorm, seed, length)
        else:
            info = findOutcastInstances(Xnorm, seed, length)
        # a falsy info means this seed yielded nothing usable
        if info and info.score > bestScore:
            bestScore = info.score
            bestOutcast = info

    # Guard the report: with no usable seed bestOutcast remains None and
    # dereferencing it would raise AttributeError.
    if bestOutcast is not None:
        print("bestOutcast idxs at length {}: {}".format(
            length, bestOutcast.idxs))

    return bestOutcast
Esempio n. 3
0
def learnFF(seq,
            X,
            Xblur,
            Lmin,
            Lmax,
            Lfilt,
            generateSeedsAlgo=DEFAULT_SEEDS_ALGO,
            generalizeSeedsAlgo=DEFAULT_GENERALIZE_ALGO,
            extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO,
            generateSeedsStep=.1,
            padBothSides=True,
            **generalizeKwargs):

    padLen = (len(seq) - X.shape[1]) // 2
    if padBothSides:
        X = ar.addZeroCols(X, padLen, prepend=True)
        X = ar.addZeroCols(X, padLen, prepend=False)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

    tStartSeed = time.clock()

    # find seeds; i.e., candidate instance indices from which to generalize
    numShifts = int(1. / generateSeedsStep) + 1
    stepLen = int(Lmax * generateSeedsStep)
    windowLen = Lmax + stepLen
    print "learnFF(): stepLen, numShifts", stepLen, numShifts

    if generateSeedsAlgo == 'pair':
        searchLen = (Lmin + Lmax) // 2
        motif = findMotifOfLengthFast([seq], searchLen)
        seedIdxs = [motif.idx1, motif.idx2]
        print "seedIdxs from motif: ", seedIdxs
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)

    elif generateSeedsAlgo == 'all':
        seedIdxs = np.arange(X.shape[1] - windowLen)  # TODO remove after debug

    elif generateSeedsAlgo == 'random':
        seedIdxs = list(np.random.choice(np.arange(len(seq) - Lmax), 2))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)

    elif generateSeedsAlgo == 'walk':
        # score all subseqs based on how much they don't look like random walks
        # when examined using different sliding window lengths
        scores = np.zeros(len(seq))
        for dim in range(seq.shape[1]):
            # compute these just once, not once per length
            dimData = seq[:, dim].ravel()
            diffs = dimData[1:] - dimData[:-1]
            std = np.std(diffs)
            for divideBy in [1, 2, 4, 8]:
                partialScores = windowScoresRandWalk(dimData,
                                                     Lmin // divideBy,
                                                     std=std)
                scores[:len(partialScores)] += partialScores

        # figure out optimal seeds based on scores of all subseqs
        bestIdx = np.argmax(scores)
        start = max(0, bestIdx - Lmin)
        end = min(len(scores), start + Lmin)
        scores[start:end] = -1
        secondBestIdx = np.argmax(scores)

        seedIdxs = [bestIdx, secondBestIdx]
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    else:
        raise NotImplementedError(
            "Only algo 'pair' supported to generate seeds"
            "; got unrecognized algo {}".format(generateSeedsAlgo))

    # compute start and end indices of seeds to try
    seedStartIdxs = np.sort(np.array(seedIdxs))
    seedStartIdxs = seedStartIdxs[seedStartIdxs >= 0]
    seedStartIdxs = seedStartIdxs[seedStartIdxs < X.shape[1] - windowLen]
    seedEndIdxs = seedStartIdxs + windowLen

    print "learnFF(): seedIdxs after removing invalid idxs: ", seedStartIdxs
    print "learnFF(): fraction of idxs used as seeds: {}".format(
        len(seedStartIdxs) / float(len(seq)))

    tEndSeed = time.clock()

    generalizeKwargs['windowLen'] = windowLen  # TODO remove after prototype

    bsfScore, bsfLocs, bsfFilt = findInstancesUsingSeedLocs(
        X,
        Xblur,
        seedStartIdxs,
        seedEndIdxs,
        Lmin,
        Lmax,
        Lfilt,
        generalizeSeedsAlgo=generalizeSeedsAlgo,
        **generalizeKwargs)

    # print "learnFF(): got bsfFilt shape", bsfFilt.shape

    startIdxs, endIdxs = extractTrueLocs(
        X,
        Xblur,
        bsfLocs,
        bsfFilt,
        windowLen,
        Lmin,
        Lmax,
        extractTrueLocsAlgo=extractTrueLocsAlgo)

    tEndFF = time.clock()
    print "learnFF(): seconds to find seeds, locs, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
        tEndSeed - tStartSeed, tEndFF - tEndSeed, tEndFF - tStartSeed)

    return startIdxs, endIdxs, bsfFilt
Esempio n. 4
0
def learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt,
	generateSeedsAlgo=DEFAULT_SEEDS_ALGO,
	generalizeSeedsAlgo=DEFAULT_GENERALIZE_ALGO,
	extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO,
	generateSeedsStep=.1, padBothSides=True, **generalizeKwargs):

	padLen = (len(seq) - X.shape[1]) // 2
	if padBothSides:
		X = ar.addZeroCols(X, padLen, prepend=True)
		X = ar.addZeroCols(X, padLen, prepend=False)
		Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
		Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

	tStartSeed = time.clock()

	# find seeds; i.e., candidate instance indices from which to generalize
	numShifts = int(1. / generateSeedsStep) + 1
	stepLen = int(Lmax * generateSeedsStep)
	windowLen = Lmax + stepLen
	print "learnFF(): stepLen, numShifts", stepLen, numShifts

	if generateSeedsAlgo == 'pair':
		searchLen = (Lmin + Lmax) // 2
		motif = findMotifOfLengthFast([seq], searchLen)
		seedIdxs = [motif.idx1, motif.idx2]
		print "seedIdxs from motif: ", seedIdxs
		seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)

	elif generateSeedsAlgo == 'all':
		seedIdxs = np.arange(X.shape[1] - windowLen) # TODO remove after debug

	elif generateSeedsAlgo == 'random':
		seedIdxs = list(np.random.choice(np.arange(len(seq) - Lmax), 2))
		seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)

	elif generateSeedsAlgo == 'walk':
		# score all subseqs based on how much they don't look like random walks
		# when examined using different sliding window lengths
		scores = np.zeros(len(seq))
		for dim in range(seq.shape[1]):
			# compute these just once, not once per length
			dimData = seq[:, dim].ravel()
			diffs = dimData[1:] - dimData[:-1]
			std = np.std(diffs)
			for divideBy in [1, 2, 4, 8]:
				partialScores = windowScoresRandWalk(dimData, Lmin // divideBy,
					std=std)
				scores[:len(partialScores)] += partialScores

		# figure out optimal seeds based on scores of all subseqs
		bestIdx = np.argmax(scores)
		start = max(0, bestIdx - Lmin)
		end = min(len(scores), start + Lmin)
		scores[start:end] = -1
		secondBestIdx = np.argmax(scores)

		seedIdxs = [bestIdx, secondBestIdx]
		seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
	else:
		raise NotImplementedError("Only algo 'pair' supported to generate seeds"
			"; got unrecognized algo {}".format(generateSeedsAlgo))

	# compute start and end indices of seeds to try
	seedStartIdxs = np.sort(np.array(seedIdxs))
	seedStartIdxs = seedStartIdxs[seedStartIdxs >= 0]
	seedStartIdxs = seedStartIdxs[seedStartIdxs < X.shape[1] - windowLen]
	seedEndIdxs = seedStartIdxs + windowLen

	print "learnFF(): seedIdxs after removing invalid idxs: ", seedStartIdxs
	print "learnFF(): fraction of idxs used as seeds: {}".format(
		len(seedStartIdxs) / float(len(seq)))

	tEndSeed = time.clock()

	generalizeKwargs['windowLen'] = windowLen # TODO remove after prototype

	bsfScore, bsfLocs, bsfFilt = findInstancesUsingSeedLocs(X, Xblur,
		seedStartIdxs, seedEndIdxs, Lmin, Lmax, Lfilt,
		generalizeSeedsAlgo=generalizeSeedsAlgo,
		**generalizeKwargs)

	# print "learnFF(): got bsfFilt shape", bsfFilt.shape

	startIdxs, endIdxs = extractTrueLocs(X, Xblur, bsfLocs, bsfFilt, windowLen,
		Lmin, Lmax, extractTrueLocsAlgo=extractTrueLocsAlgo)

	tEndFF = time.clock()
	print "learnFF(): seconds to find seeds, locs, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
		tEndSeed - tStartSeed, tEndFF - tEndSeed, tEndFF - tStartSeed)

	return startIdxs, endIdxs, bsfFilt