def BJ(self, gamma=0.1):
    """
    Exact Berk-Jones statistic

    Args:
    -----
    'gamma' : lower fraction of P-values to consider

    Returns:
    -------
    -log(BJ) score, P-value attaining it
    """
    spv = self._pvals
    N = self._N
    if N == 0:
        return np.nan, np.nan

    bj = spv[0]
    p_th = spv[0]
    ii = np.arange(1, N + 1)
    max_i = max(1, int(gamma * len(ii)))

    if len(spv) >= 1:
        BJpv = beta.sf(spv, ii, N - ii + 1)[:max_i]
        i_star = np.argmin(BJpv)
        bj = BJpv[i_star]
        p_th = spv[i_star]
    return -np.log(bj), p_th
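# A minimal, self-contained sketch of the same order-statistic scan as BJ()
# above, assuming scipy.stats.beta and p-values already sorted ascending. The
# helper name berk_jones_score and the example data are illustrative, not part
# of the original class.
import numpy as np
from scipy.stats import beta


def berk_jones_score(sorted_pvals, gamma=0.1):
    """Return (-log(BJ), p-value attaining the minimum) for sorted p-values."""
    N = len(sorted_pvals)
    if N == 0:
        return np.nan, np.nan
    ii = np.arange(1, N + 1)
    max_i = max(1, int(gamma * N))
    # Under the null, the i-th smallest p-value is Beta(i, N - i + 1); scan the
    # lowest gamma fraction for the smallest tail probability.
    BJpv = beta.sf(sorted_pvals, ii, N - ii + 1)[:max_i]
    i_star = np.argmin(BJpv)
    return -np.log(BJpv[i_star]), sorted_pvals[i_star]


# Uniform p-values should give an unremarkable score.
print(berk_jones_score(np.sort(np.random.uniform(size=100))))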
def beta_sf_wrapper(q, a, b):
    """
    Wrapper around beta.sf that skips low-fpd locations.
    beta.sf is equivalent to "1 - beta.cdf".
    """
    if a / (a + b) < c.low_score:
        return c.low_score
    return beta.sf(q, a, b)
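# beta_sf_wrapper depends on a module-level constant c.low_score. A hedged,
# self-contained variant with the threshold passed explicitly; the name
# beta_sf_guarded and the default 1e-3 are illustrative, not the original
# module's values.
from scipy.stats import beta


def beta_sf_guarded(q, a, b, low_score=1e-3):
    """Return beta.sf(q, a, b), short-circuiting when the Beta mean a/(a+b)
    is already below low_score."""
    if a / (a + b) < low_score:
        return low_score
    return beta.sf(q, a, b)  # equivalent to 1 - beta.cdf(q, a, b)


print(beta_sf_guarded(0.2, 5, 15))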
def ensure_balanced_hets(seg_table, het_table):
    seg_table['aSCNA'] = np.zeros([len(seg_table), 1])
    aSCNA_hets = []
    for seg_id, seg in seg_table.iterrows():
        seg_hets = het_table[het_table['seg_id'] == seg_id]
        if np.sum(seg_hets['d'] == -1) > 10 and np.sum(seg_hets['d'] == 1) > 10:
            alts = np.concatenate([
                np.array(seg_hets['ALT_COUNT_T'][np.array(seg_hets['d'] == -1)]),
                np.array(seg_hets['REF_COUNT_T'][np.array(seg_hets['d'] == 1)])
            ])
            refs = np.concatenate([
                np.array(seg_hets['ALT_COUNT_T'][np.array(seg_hets['d'] == 1)]),
                np.array(seg_hets['REF_COUNT_T'][np.array(seg_hets['d'] == -1)])
            ])
            f = np.mean(np.true_divide(alts, alts + refs))
            seg_hets = seg_hets[np.logical_and(
                beta.sf(f, alts + 1, refs + 1) < 0.995,
                beta.sf(f, alts + 1, refs + 1) > 0.005)]
            if sum(seg_hets['AF_N'] > 0.5) < sum(seg_hets['AF_N'] <= 0.5):
                sites = seg_hets['AF_N'] <= 0.5
                index = list(compress(xrange(len(sites)), sites))
                ixs = random.sample(index, (sum(seg_hets['AF_N'] <= 0.5) -
                                            sum(seg_hets['AF_N'] > 0.5)))
                seg_hets = seg_hets.drop(seg_hets.index[[ixs]])
                seg_hets.reset_index(inplace=True, drop=True)
            if sum(seg_hets['AF_N'] > 0.5) > sum(seg_hets['AF_N'] <= 0.5):
                sites = seg_hets['AF_N'] > 0.5
                index = list(compress(xrange(len(sites)), sites))
                ixs = random.sample(index, (sum(seg_hets['AF_N'] > 0.5) -
                                            sum(seg_hets['AF_N'] <= 0.5)))
                seg_hets = seg_hets.drop(seg_hets.index[[ixs]])
                seg_hets.reset_index(inplace=True, drop=True)
            if len(aSCNA_hets) == 0:
                aSCNA_hets = seg_hets
            else:
                aSCNA_hets = pd.concat([aSCNA_hets, seg_hets])
                aSCNA_hets.reset_index(inplace=True, drop=True)
    return aSCNA_hets
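# The central beta.sf step in ensure_balanced_hets keeps only heterozygous
# sites whose allele counts are consistent with the segment-mean allele
# fraction f. A small isolated sketch of that filter on plain arrays; the
# counts are made up, and the 0.005/0.995 band matches the function above.
import numpy as np
from scipy.stats import beta

alt_counts = np.array([12, 30, 5, 22])
ref_counts = np.array([18, 10, 40, 20])
f = np.mean(alt_counts / (alt_counts + ref_counts))

# Tail probability of f under a Beta(alt + 1, ref + 1) posterior per site;
# keep sites where f does not fall in either extreme tail.
sf = beta.sf(f, alt_counts + 1, ref_counts + 1)
keep = (sf < 0.995) & (sf > 0.005)
print(keep)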
def sf(self, val) -> Array:
    """
    Calculates the survival function for a set of inputs

    Parameters
    ----------
    val: numeric or numeric-array
        Values to return the survival function calculation on

    Returns
    -------
    Array:
        survival function based on the val parameter
    """
    x = ((val - self.a) / self.range).clip(0, 1)
    sf_val = beta_dist.sf(x, self.alpha, self.beta)
    return sf_val
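# sf() above rescales val from [self.a, self.a + self.range] onto [0, 1] before
# calling the standard Beta survival function. A standalone sketch of the same
# idea; a, rng, alpha, b are illustrative values, and scipy's loc/scale
# arguments are shown only as a cross-check.
import numpy as np
from scipy.stats import beta as beta_dist

a, rng, alpha, b = 2.0, 8.0, 2.0, 5.0
val = np.array([1.0, 4.0, 9.0, 12.0])

x = ((val - a) / rng).clip(0, 1)  # manual shift/scale, clipped to [0, 1]
manual = beta_dist.sf(x, alpha, b)
builtin = beta_dist.sf(val, alpha, b, loc=a, scale=rng)
print(manual)
print(builtin)  # same values: loc/scale perform the identical rescaling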
def solve(self, x, y):
    if (not x and not y) or len(x) != len(y):
        return [None, None]
    n = len(x)
    x_mean = np.mean(x)
    y_mean = np.mean(y)
    x_delta = [xi - x_mean for xi in x]
    y_delta = [yi - y_mean for yi in y]
    sum_xy = np.sum([x_delta[i] * y_delta[i] for i in range(0, n)])
    sum_x2 = np.sum([xd**2 for xd in x_delta])
    sum_y2 = np.sum([yd**2 for yd in y_delta])
    r_val = sum_xy / np.sqrt(sum_x2) / np.sqrt(sum_y2)
    if abs(r_val) == 1.0:
        p_val = 0.0
    else:
        t2 = r_val**2 * ((n - 2) / ((1.0 - r_val) * (1.0 + r_val)))
        p_val = 1 - beta.sf((n - 2) / (n - 2 + t2), 0.5 * (n - 2), 0.5)
    return [round(r_val, 6), round(p_val, 6)]
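# solve() computes Pearson's r and its two-sided p-value through the
# incomplete-beta relation: under the null, (n-2)/((n-2)+t^2) follows a
# Beta((n-2)/2, 1/2) distribution, so 1 - beta.sf(...) is the p-value. A quick
# self-contained cross-check against scipy.stats.pearsonr on made-up data:
import numpy as np
from scipy.stats import beta, pearsonr

x = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
y = [2.1, 2.9, 4.2, 3.9, 6.1, 5.8]

n = len(x)
r = np.corrcoef(x, y)[0, 1]
t2 = r**2 * ((n - 2) / ((1.0 - r) * (1.0 + r)))
p = 1 - beta.sf((n - 2) / (n - 2 + t2), 0.5 * (n - 2), 0.5)  # = beta.cdf(...)

print(r, p)
print(pearsonr(x, y))  # should agree to floating-point precision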
def Beta_ab_cdf(a, b):
    """
    calculate I(a,b) defined in the paper, using the function beta.sf
    :param a: beta distribution parameter a
    :param b: beta distribution parameter b
    :return: Pr( theta > 0.5 | theta ~ Beta(a, b) )
    """
    if beta_dic.get((a, b)) is None:
        beta_dic[(a, b)] = beta.sf(0.5, a, b)
    I_ab = beta_dic[(a, b)]
    # a = float(a)
    # b = float(b)
    # I_ab = 1 - eng.cdf('Beta', 0.5, a, b)
    # I_ab = 0.3
    # I_ab = beta.sf(0.5, a, b)
    return I_ab
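# Beta_ab_cdf memoizes Pr(theta > 0.5 | theta ~ Beta(a, b)) in the module-level
# dict beta_dic. A hedged alternative sketch that gets the same caching from
# functools.lru_cache; the name beta_tail_half is illustrative.
from functools import lru_cache

from scipy.stats import beta


@lru_cache(maxsize=None)
def beta_tail_half(a, b):
    """Pr(theta > 0.5) for theta ~ Beta(a, b), cached per (a, b) pair."""
    return beta.sf(0.5, a, b)


print(beta_tail_half(3, 2))  # computed
print(beta_tail_half(3, 2))  # served from the cache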
def tryFindRelevantSubseq():
    # synth.seedRng(123)

    NLL = np.inf

    nExamples = 3
    nNoise = 10
    downsampleBy = 5
    # seqs = synth.makeSinesDataset(numSines=nExamples, numNoise=nNoise)
    seqs = synth.makeSinesDataset(numSines=nExamples, numNoise=nNoise, warped=True)
    # seqs = synth.makeSinesDataset(numSines=0, numNoise=nNoise+nExamples) # all noise
    # seqs = seqs[:nExamples] # only positive examples, no noise for now
    # seqs = map(lambda s: s + synth.randconst(s.shape, std=.25), seqs)
    seqs = map(lambda s: ar.downsampleMat(s, rowsBy=downsampleBy), seqs)

    # length = 40 / downsampleBy
    # pruneCorr = -1
    # mat = sub.similarityMat(seqs, length, pruneCorrAbove=pruneCorr)
    # plt.figure()
    # plt.imshow(mat)
    # plt.show()
    # return

    # for s in seqs:
    #     plt.figure()
    #     plt.plot(s)
    # plt.show()
    # return

    # plt.plot(np.vstack(seqs).T) # yep, looks right
    # plt.show()

    # simMat = ff2.computeSimMat(seqs, 8, .2, k=10, matForEachSeq=False)
    # plt.imshow(simMat)
    # plt.show()

    # length = 40 / downsampleBy
    # length = 60 / downsampleBy
    length = 8
    # dMax = length * (1 - minSim) # NOT actually enforcing same threshold
    dMax = length * .1
    # minSim = .5 # not equivalent to above dMax
    minSim = 0.  # not equivalent to above dMax
    # dMax = length * 0
    # Xs = ff2.computeSimMat(seqs, length, dMax, k=(2*length), matForEachSeq=True)
    Xs = ff2.computeSimMat(seqs, length, dMax, matForEachSeq=True,
                           removeSelfMatch=True)
                           # normFeatures='mean') # works better than znorming
                           # normFeatures='z')
    # Xnorms = Xs

    # plt.figure()
    # plt.imshow(Xs)
    # plt.show()
    # return

    # ------------------------ kill small values
    Xs = map(lambda X: X * (X > minSim), Xs)

    # ------------------------ extract relative maxima
    # Xs = map(lambda X: ff2.localMaxFilterSimMat(X), Xs)

    # ------------------------ temporal pooling
    # Xs = map(lambda X: ff2.filterSimMat(X, length-1, 'hamming'), Xs)
    # Xs = map(lambda X: ff2.filterSimMat(X, length-1, 'flat'), Xs)

    # ------------------------ normalize mean
    # use mean for each feature (row)
    X_combined = np.hstack(Xs)
    featureMeans = np.mean(X_combined, axis=1).reshape((-1, 1))
    Xnorms = map(lambda X: X - featureMeans, Xs)

    # use grand mean
    X_combined = np.hstack(Xs)
    # grandMean = np.mean(X_combined)
    # Xnorms = map(lambda X: X - grandMean, Xs)

    # no mean subtraction!
    # Xnorms = map(lambda X: X, Xs)

    # use mean for each element (row, col position)
    # X_mean = np.copy(Xs[0])
    # for X in Xs[1:]:
    #     X_mean += X
    # X_mean /= nExamples
    # Xnorms = map(lambda X: X - X_mean, Xs)

    # for X in Xnorms:
    #     print "min", np.min(X)
    #     print "max", np.max(X)
    # return

    # ------------------------ normalize variance
    # Xnorms = map(lambda x: ar.stdNormalizeRows(x), Xnorms)

    # print X_mean
    # print np.where(np.isnan(X_mean))[0]
    # return

    # for s, m in zip(seqs, Xs):
    # # for s, m in zip(seqs, Xnorms):
    #     print m.shape
    #     plt.figure()
    #     ax1 = plt.subplot2grid((2,1), (0,0))
    #     ax2 = plt.subplot2grid((2,1), (1,0))
    #     ax2.autoscale(tight=True)
    #     ax1.plot(s)
    #     m2 = np.hstack((m, np.zeros((m.shape[0], length-1))))
    #     ax2.imshow(m2, interpolation='nearest', aspect='auto')
    # plt.show()
    # return

    # ax3.autoscale(tight=True)
    # ax2.imshow(synth.appendZeros(Xs[0], length-1), interpolation='nearest', aspect='auto')
    # ax1.set_title("Sequence containing sine wave")
    # ax2.set_title('Feature Representation of Sequence')
    # ax3.set_title('Learned Weights')

    # ax1 = plt.subplot2grid((1,3), (0,0))
    # ax2 = plt.subplot2grid((1,3), (0,1))
    # ax3 = plt.subplot2grid((1,3), (0,2))
    # ax2.autoscale(tight=True)
    # ax3.autoscale(tight=True)
    # ax1.plot(seqs[0])
    # ax2.imshow(synth.appendZeros(Xs[0], length-1), interpolation='nearest', aspect='auto')
    # ax1.set_title("Sequence containing sine wave")
    # ax2.set_title('Feature Representation of Sequence')
    # ax3.set_title('Learned Weights')

    # Y = np.empty(len(seqs))
    W = np.ones(Xs[0].shape, dtype=np.float64)
    W /= np.linalg.norm(W)
    Cov0 = np.zeros(W.shape) + np.mean(np.var(X_combined, axis=1))  # variance of everything ever
    Cov = np.copy(Cov0)
    # W += (Xs[0] + Xs[1]) / 2.

    # lamda = 20.
    # lamda_scaleBy = 1.
    lamda_scaleBy = 0.
    penalty = np.copy(W) * lamda_scaleBy
    lamda = penalty[0][0]
    # penalty = np.zeros(Xs[0].shape) + 1

    # plt.figure()
    # plt.imshow(W)

    nSeqs = len(seqs)
    ys = np.empty(nSeqs)
    dWs = np.empty((nSeqs, W.shape[0], W.shape[1]))
    dCovs = np.empty((nSeqs, W.shape[0], W.shape[1]))
    y0s = np.empty(nSeqs)
    # plt.figure()
    for ep in range(10):
        # print "w nan at", np.where(np.isnan(W))[0]
        for i in range(nSeqs):
            # X = Xs[i]
            Xnorm = Xnorms[i]  # X - E[X]
            # ys[i] = np.sum(W * X)
            # print "y", y
            # dWs[i] = (Xnorm - W) * ys[i]
            # dW = (Xnorm - W) * np.exp(y)
            # print "max(X)", np.max(X)
            # print "max(X-E[X])", np.max(Xnorm)
            # print "max(dW)", np.max(dW)

            # ys[i] = np.sum(W * X)
            # dWs[i] = (X - W) # just vanilla avg

            # ys[i] = np.sum(W * Xnorm)
            # dWs[i] = (Xnorm - W) # just vanilla avg

            diff = Xnorm - W
            diff_sq = diff * diff
            ys[i] = np.sum(diff_sq / Cov)
            dWs[i] = diff
            dCovs[i] = diff_sq
            y0s[i] = np.sum(Xnorm * Xnorm / Cov0)

        # alpha = 1.
        # probs = np.exp(ys * alpha)
        # probs /= np.sum(probs)
        # probs = np.arange(nSeqs) < nExamples
        sortIdxs = np.argsort(ys)[::-1]  # descending order
        p = float(nExamples) / nSeqs
        scaleBy = 10
        positions = np.linspace(0., 1., nSeqs)
        betaProbs = beta.sf(positions, p * scaleBy, (1 - p) * scaleBy)
        ySort = ys[sortIdxs]
        sigmoid = fitLogistic(ySort)
        # plt.plot(ySort / ySort[0], 'o')
        # probs = betaProbs
        # probs = sigmoid

        # print ys, y0s
        gaussDims = np.prod(Xnorms[0].shape)
        # divideDistsBy = np.sqrt(gaussDims)
        divideDistsBy = gaussDims
        probsPat = np.exp(-ys / divideDistsBy)
        probs0 = np.exp(-y0s / divideDistsBy)
        probs = probsPat / (probsPat + probs0)
        probs /= np.sum(probs)  # set sum of update weights = 1

        for i, p in enumerate(probs):
            idx = sortIdxs[i]
            dWs[i] *= probs[idx]
            dCovs[i] *= probs[idx]
        dW = np.sum(dWs, axis=0)
        dCov = np.sum(dCovs, axis=0)
        # print dW.shape

        lurn = 1. / (np.sqrt(ep + 1))
        # print "lurn", lurn
        W += lurn * dW
        covLambda = .95
        Cov = covLambda * dCov + (1 - covLambda) * Cov0
        # Cov += lurn * dCov

        # W /= np.linalg.norm(W) # make it zero not quite as much stuff
        # W /= np.size(W)
        # W = ff2.l1Project(W) # zeros almost everything ever
        # print np.sum(np.abs(W))
        # W[W < .001 / W.size] = 0
        # W -= np.maximum(np.abs(W), penalty) * np.sign(W)
        # W -= penalty * np.sign(W)
        # W[np.abs(W) < lamda] = 0.
        # W = np.maximum(0., W)
        # TODO proper projection onto L1 ball
        # W /= np.linalg.norm(W) # L2 constraint
        # W /= np.sum(W) # L1 constraint

        print ys
        # print probs
        print np.sum(ys)
        print np.dot(ys[sortIdxs], probs)
        # oldNLL = NLL
        NLL = -np.sum(np.log(probs))
        # if oldNLL < NLL: # this is far from nondecreasing ATM
        #     print "================================"
        #     print "oldNLL %g < new NLL %g" % (oldNLL, NLL)
        #     print "================================"
        print "NLL: ", NLL
        print "------------------------ /iter%d" % (ep + 1)

    # # logistic function seems to nail the split even better, although
    # # hard to know what would happen if data weren't so contrived
    # plt.figure()
    # # ySort = ys[sortIdxs] / np.max(ys)
    # # ySort = ys[sortIdxs]
    # plt.plot(ySort / ySort[0], 'o')
    # # sigmoid = fitLogistic(ySort)
    # plt.plot(sigmoid, label='sigmoid')
    # plt.plot(betaProbs / np.max(betaProbs), label='beta')
    # prod = sigmoid * betaProbs
    # plt.plot(prod / np.max(prod), label='product')
    # plt.legend(loc='best')

    # plt.figure()
    # plt.imshow(W)

    # ------------------------ reconstruct stuff from time domain
    # Wscores = W*W / Cov
    # patScores = np.exp(-Cov)
    Wsq = W * W
    # print "Cov0 = ", Cov0[0,0]
    Wsq[Wsq < Cov0] = 0.  # XXX remove hack to kill low weights
    # Wsq -= Cov0
    zeroScores = Wsq / Cov0
    # print np.mean(patScores)
    # print np.mean(zeroScores)
    # zeroScores = np.exp(-zeroScores)
    # scoresMat = 1. - patScores / (patScores + zeroScores)
    scoresMat = zeroScores

    # these are like identical, suggesting cov is basically proportional
    # to mean in most cases; apparently just picking big means is probably
    # better than picking big means with small covs
    # plt.figure()
    # plt.imshow(patScores)
    # plt.figure()
    # plt.imshow(zeroScores)
    # plt.colorbar()

    # print np.min(scoresMat, axis=0)
    # print np.max(scoresMat, axis=0)
    # scoresMat[scoresMat < np.max(scoresMat)/2] = 0.
    # Wscores = np.mean(scoresMat, axis=0)

    Wscores = np.mean(scoresMat, axis=0)
    while True:  # repeatedly subtract .05 until just one section above 0
        idxsAbove0 = np.where(Wscores > 0)[0]
        changes = idxsAbove0[1:] - idxsAbove0[:-1]
        # break once the positive indices form one contiguous run and
        # something has dropped below zero
        if np.all(changes <= 1) and np.min(Wscores) < 0:
            break
        Wscores -= .02  # Wow, this is terrible; really need a way to set this...
    # ^ perhaps figure out value we'd need to get just one contiguous positive
    # section somewhere
    start, end, _ = maxSubarray(Wscores)
    # patStart, patEnd = start - length/2, end + length/2
    # patStart, patEnd = start + length/2, end + length/2
    patStart, patEnd = start, end + length

    # ------------------------ show distro of W
    # plt.figure()
    # plt.plot(np.sort(W[W > 0.].flatten()), 'x')

    # ------------------------ viz learned weights and target seqs
    mainPlot = 1
    if mainPlot:
        plt.figure(figsize=(10, 7))

        # plot sequences (and sum of weights at the top)
        axSeq1 = plt.subplot2grid((4, 5), (0, 0))
        axSeq2 = plt.subplot2grid((4, 5), (0, 1))
        axSeq3 = plt.subplot2grid((4, 5), (0, 2))
        axWeightSums = plt.subplot2grid((4, 5), (0, 3))
        for ax in (axSeq1, axSeq2, axSeq3, axWeightSums):
            ax.autoscale(tight=True)
        axSeq1.set_title("Instance #1")
        axSeq2.set_title("Instance #2")
        axSeq3.set_title("Instance #3")
        axWeightSums.set_title("Sum of Weights")
        axSeq1.plot(seqs[0])
        axSeq2.plot(seqs[1])
        axSeq3.plot(seqs[2])
        # W = ff2.localMaxFilterSimMat(W)
        # W[W < .01] = 0.
        W[W < .05] = 0.
        Wpad = synth.appendZeros(W, length - 1, axis=1)
        Wsums = np.sum(Wpad, axis=0)
        axWeightSums.plot(Wsums / np.max(Wsums))
        # numNonzerosInCols = np.sum(Wpad > 0., axis=0) + 1.
        # print numNonzerosInCols
        # Wmeans = Wsums / numNonzerosInCols
        # axWeightSums.plot(Wmeans / np.max(Wmeans))
        viz.plotRect(axWeightSums, 60 / downsampleBy, 140 / downsampleBy)

        # plot simMats for sequences
        axMat1 = plt.subplot2grid((4, 5), (1, 0), rowspan=3)
        axMat2 = plt.subplot2grid((4, 5), (1, 1), rowspan=3)
        axMat3 = plt.subplot2grid((4, 5), (1, 2), rowspan=3)
        axMat4 = plt.subplot2grid((4, 5), (1, 3), rowspan=3)
        for ax in (axMat1, axMat2, axMat3, axMat4):
            ax.autoscale(tight=True)
        for i, ax in enumerate((axMat1, axMat2, axMat3)):
            ax.set_title("Features {}".format(i))
            # ax.plot(seqs[i])
            # ax.imshow(W)
            ax.imshow(synth.appendZeros(Xs[i], length - 1),
                      interpolation='nearest', aspect='auto')
        axMat4.set_title("Means")
        axMat4.imshow(synth.appendZeros(W, length - 1),
                      interpolation='nearest', aspect='auto')
        viz.plotRect(axMat4, 60 / downsampleBy, 140 / downsampleBy)

        # plot weights of stuff for extraction
        axScores = plt.subplot2grid((4, 5), (1, 4), rowspan=3)
        axScores.autoscale(tight=True)
        axScores.set_title("Scores")
        axScores.imshow(synth.appendZeros(scoresMat, length - 1),
                        interpolation='nearest', aspect='auto')

        # plot extracted ts
        axExtract = plt.subplot2grid((4, 5), (0, 4))
        axExtract.autoscale(tight=True)
        axExtract.set_title("Extracted Subsequences")
        for s in seqs[:nExamples]:
            axExtract.plot(s)
        viz.plotRect(axExtract, patStart, patEnd - 1, color='g')

        plt.tight_layout(pad=.01)

    # Wmeans = np.mean(np.abs(W), axis=0)
    # Wmeans = np.mean(W*W, axis=0)
    # means = map(lambda X: np.mean(X*X), Xnorm)
    # mean = reduce(lambda x1, x2: (x1 + x2), means)
    # mean /= len(Xnorm)
    # # penalty = np.zeros(len(Wmeans)) + mean
    # # cumPenalty = np.cumsum(penalty)
    # Wscores = Wmeans - mean
    # Wscores -= np.log(.7) - np.log(.3) # difference in log probs of mean vs pattern gauss
    # Wscores = np.maximum(0, Wscores)
    # print np.min(Wscores)
    # print np.max(Wscores)
    # Wscores[Wscores < np.max(Wscores)/10] = 0.

    # plt.figure()
    # plt.imshow(scoresMat)

    # plt.figure()
    # # plt.plot(Wmeans)
    # # plt.gca().ticklabel_format(axis='y', style='plain') # stop being sci notation!
    # plt.plot(Wscores)
    # # plt.ylim((np.min(Wscores), 1.))
    # start, end, _ = maxSubarray(Wscores)
    # # end -= 1 # returned end idx isn't inclusive
    # print "start, end", start, end
    # viz.plotRect(plt.gca(), start, end-1)
    # # patStart, patEnd = start - length/2, end + length/2
    # # patStart, patEnd = start + length/2, end + length/2
    # patStart, patEnd = start, end + length
    # viz.plotRect(plt.gca(), patStart, patEnd-1, color='g')
    # # plt.plot(np.cumsum(Wmeans) - cumPenalty)
    # # plt.plot(np.cumsum(Wmeans[::-1])[::-1] - cumPenalty[::-1])

    # plt.figure()
    # for s in seqs[:nExamples]:
    #     plt.plot(s)
    # viz.plotRect(plt.gca(), patStart, patEnd-1, color='g')
    #     # plt.plot(ar.meanNormalizeCols(s[patStart:patEnd]))

    plt.show()
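# Inside the training loop above, beta.sf turns evenly spaced rank positions
# into a candidate soft weighting over which sequences contain the pattern
# (one of several schemes tried there; the Gaussian-likelihood probs are what
# the update ultimately uses). A small isolated sketch of just that step,
# using the same default counts as the function:
import numpy as np
from scipy.stats import beta

nExamples, nNoise = 3, 10
nSeqs = nExamples + nNoise
p = float(nExamples) / nSeqs   # expected fraction of pattern-bearing sequences
scaleBy = 10                   # sharpness of the prior

positions = np.linspace(0., 1., nSeqs)
betaProbs = beta.sf(positions, p * scaleBy, (1 - p) * scaleBy)
print(np.round(betaProbs, 3))  # near 1 for top-ranked positions, decaying toward 0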
def survival_at_thresh(self, thresh):
    for prop in self.all_props:
        hw_cnt = self.hw_cnts[prop]
        true_cond_t = self.frame_cnts[prop]
        survival = beta.sf(thresh, hw_cnt + 0.5, hw_cnt - true_cond_t + 0.5)
        yield prop, survival
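# survival_at_thresh follows the common pattern of evaluating the upper tail of
# a Beta distribution with half-count (Jeffreys-style) pseudo-counts. A hedged,
# self-contained sketch of that pattern for a plain successes/trials pair; the
# helper name and the counts are illustrative, not the original class's counters.
from scipy.stats import beta


def posterior_exceeds(thresh, successes, trials):
    """Pr(rate > thresh) under a Beta(successes + 0.5, failures + 0.5)
    (Jeffreys prior) posterior."""
    failures = trials - successes
    return beta.sf(thresh, successes + 0.5, failures + 0.5)


print(posterior_exceeds(0.8, successes=45, trials=50))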