import operator
from functools import reduce

from utils import hist


def order_pieces_prob(face, pieces, vtxsum, vtxocc):
    """Score every (piece, rotation) for a face and return them best-first."""
    # Per-vertex distribution over the next value, given the remaining sum
    # (v - vtxsum[v]) and the remaining number of slots (5 - vtxocc[v]).
    hists = {v: hist(v - vtxsum[v], 5 - vtxocc[v]) for v in face}
    opieces = []
    for piece, rots in pieces.items():
        for rot in rots:
            rpiece = rot_piece(piece, rot)
            # Probability of this placement: product of the per-vertex
            # probabilities of each of the rotated piece's values.
            score = reduce(operator.mul,
                           (hists[v][p] for v, p in zip(face, rpiece)), 1.0)
            assert 0 <= score <= 1.0
            opieces.append((score, piece, rpiece))
    opieces.sort(reverse=True)  # highest-probability placements first
    return opieces
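# `utils.hist` is not shown in this snippet. From its use above --
# hist(v - vtxsum[v], 5 - vtxocc[v]) indexed by a piece value and multiplied
# into a probability -- it plausibly maps "n remaining slots that must sum to
# `total`" to the marginal distribution of one slot. A brute-force sketch
# under that assumption (the 0..9 slot-value range is an invention here):
from collections import Counter
from itertools import product


def hist(total, n, values=range(10)):
    """Marginal distribution of one slot among n slots summing to `total`."""
    if n == 0:
        return {p: 0.0 for p in values}
    counts = Counter()
    for combo in product(values, repeat=n):
        if sum(combo) == total:
            counts[combo[0]] += 1
    norm = sum(counts.values())
    # Every candidate value gets an entry, so lookups like hists[v][p] never
    # raise; impossible values simply score 0.
    return {p: (counts[p] / norm if norm else 0.0) for p in values}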
def stats():
    """Return some statistical data to be viewed in a table."""
    year_list = df.columns[2:-1]
    # list of info to be viewed in a table
    table = []
    table.append([
        "TOTAL NUMBER OF MIGRANTS",
        "{:,.0f}".format(df["sum"].sum()),
        "Over a Period of 73 Years From 1945 to 2018"
    ])
    table.append([
        "AVERAGE NUMBER OF MIGRANTS",
        "{:,.0f}".format(df["sum"].sum() / 73),
        "-"
    ])
    table.append([
        "MINIMUM NUMBER OF MIGRANTS FROM A COUNTRY",
        "{:,.0f}".format(df["sum"].min()),
        "'Chad' from 1945 to 2018"
    ])
    table.append([
        "MAXIMUM NUMBER OF MIGRANTS FROM A COUNTRY",
        "{:,.0f}".format(df["sum"].max()),
        "'UK & Ireland' from 1945 to 2018"
    ])
    table.append([
        "MINIMUM NUMBER OF MIGRANTS IN A YEAR",
        "{:,.0f}".format(df[year_list].sum(axis=0).min()),
        "Almost Two Years from Oct 1945 to Jun 1947"
    ])
    table.append([
        "MAXIMUM NUMBER OF MIGRANTS IN A YEAR",
        "{:,.0f}".format(df[year_list].sum(axis=0).max()),
        "From 2012 to 2013"
    ])
    table.append([
        "MAXIMUM NUMBER OF MIGRANTS FROM A SINGLE COUNTRY AND IN A YEAR",
        "{:,.0f}".format(df[year_list].max().max()),
        "UK & Ireland from 1968 to 1969"
    ])
    return render_template("stats.html", table=table, image=hist(df))
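# The `hist(df)` helper passed to the template above is not part of this
# snippet, so the following is a guess at its contract: take the DataFrame,
# render a histogram of the per-country totals off-screen, and return the PNG
# as a base64 string the template can drop into an <img> tag. A minimal
# sketch under those assumptions:
import base64
import io

import matplotlib
matplotlib.use("Agg")  # render off-screen; no display available in a web app
import matplotlib.pyplot as plt


def hist(df):
    fig, ax = plt.subplots()
    ax.hist(df["sum"], bins="auto")
    ax.set_xlabel("total migrants per country")
    ax.set_ylabel("number of countries")
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight")
    plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode("ascii")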
grid = np.vstack([X, Y]).reshape(2, -1)

# Trained model
axScatter.contour(X, Y, M(grid).reshape(X.shape[0], Y.shape[0]),
                  colors='blue', linestyles='--')

# Conditional plots
axHistx1.plot(xx, M1(xx.reshape(1, 120)).flatten(), color='b', linestyle='-')
axHistx2.plot(xx, M2(xx.reshape(1, 120)).flatten(), color='b', linestyle='-')
axHistx3.plot(xx, M3(xx.reshape(1, 120)).flatten(), color='b', linestyle='-')

# Conditional histograms (i.e. empirical conditionals)
axHistx1.hist(ut.hist(data, y_slice[0]), bins='auto', density=True, color='lightgray')
axHistx2.hist(ut.hist(data, y_slice[1]), bins='auto', density=True, color='lightgray')
axHistx3.hist(ut.hist(data, y_slice[2]), bins='auto', density=True, color='lightgray')

plt.savefig('gaussian-mix.png', bbox_inches='tight')
print('# Image saved at ./gaussian-mix.png')
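# Why np.vstack([X, Y]).reshape(2, -1) works: for meshgrid outputs X, Y of
# shape (ny, nx), stacking gives (2*ny, nx) and the reshape leaves X's values
# flattened in row 0 and Y's in row 1 -- one (x, y) column per grid point,
# the 2 x N layout the trained model M is evaluated on above. A tiny
# self-contained check:
import numpy as np

xs = np.linspace(-1.0, 1.0, 3)
ys = np.linspace(0.0, 2.0, 2)
X, Y = np.meshgrid(xs, ys)               # both have shape (2, 3)
grid = np.vstack([X, Y]).reshape(2, -1)  # shape (2, 6)
assert np.array_equal(grid[0], X.ravel())
assert np.array_equal(grid[1], Y.ravel())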
import pdb

import numpy as np
import torch
from scipy import linalg
import sklearn.decomposition as decom

import utils

# doc_word_embed_sen, utils.cov, utils.hist, and utils.device are defined
# elsewhere in this project.


def doc_word_embed_content_noise(content_path, noise_path, whiten_path=None,
                                 content_lines=None, noise_lines=None, opt=None):
    # Embed the content (inlier) document and the noise (outlier) document,
    # then concatenate: noise embeddings are appended after the content ones.
    no_add_set = set()
    doc_word_embed_f = doc_word_embed_sen
    content_words_ar, content_word_embeds = doc_word_embed_f(
        content_path, no_add_set, content_lines=content_lines)
    noise_words_ar, noise_word_embeds = doc_word_embed_f(
        noise_path, set(content_words_ar), content_lines=noise_lines)
    content_words_ar.extend(noise_words_ar)
    words_ar = content_words_ar
    word_embeds = torch.cat((content_word_embeds, noise_word_embeds), dim=0)

    whitening = opt.whiten if opt is not None else True
    if whitening and whiten_path is not None:
        # Use an article on the inlier topic to whiten the data.
        whiten_ar, whiten_word_embeds = doc_word_embed_f(whiten_path, set())
        whiten_cov = utils.cov(whiten_word_embeds)
        fast_whiten = False
        if not fast_whiten:
            # Full whitening: with whiten_cov = U diag(D) V^T, the transform
            # pinv(sqrt(diag(D))) U^T maps the embeddings to approximately
            # identity covariance.
            U, D, V_t = linalg.svd(whiten_cov)
            cov_inv = torch.from_numpy(
                np.matmul(linalg.pinv(np.diag(np.sqrt(D))),
                          U.transpose())).to(utils.device)
            word_embeds = torch.mm(cov_inv, word_embeds.t()).t()
            debug_whiten = False
            if debug_whiten:
                # Sanity check: the post-whitening spectrum should be flat.
                content_whitened = torch.mm(cov_inv, content_word_embeds.t()).t()
                _, D1, _ = linalg.svd(utils.cov(content_whitened))
                print('after whitening D {}'.format(D1[:7]))
        else:
            # Faster approximate whitening: shrink only the top-30 directions
            # of the covariance, leaving the remaining subspace untouched.
            sv = decom.TruncatedSVD(30)
            sv.fit(whiten_cov.cpu().numpy())
            top_evals, top_evecs = sv.singular_values_, sv.components_
            top_evals = torch.from_numpy(1 / np.sqrt(top_evals)).to(utils.device)
            top_evecs = torch.from_numpy(top_evecs).to(utils.device)
            X = word_embeds
            projected = torch.mm(top_evecs.t() / (top_evecs**2).sum(-1),
                                 torch.mm(top_evecs, X.t())).t()
            word_embeds = torch.mm(
                torch.mm(top_evecs.t(), top_evals.diag()),
                torch.mm(top_evecs, X.t())).t() + (X - projected)

    # Indices of the appended noise embeddings.
    noise_idx = torch.LongTensor(
        list(range(len(content_word_embeds), len(word_embeds)))).to(utils.device)

    normalize_per_dir = False
    if normalize_per_dir:
        # Optionally normalize embeddings by their distance from the mean.
        word_embeds_norm = ((word_embeds - word_embeds.mean(0))**2).sum(
            dim=1, keepdim=True).sqrt()
        word_embeds = (word_embeds - word_embeds.mean(0)) / word_embeds_norm

    debug_top_dir = False
    if debug_top_dir:
        # Diagnostics: project inliers and outliers onto the second principal
        # direction and compare the two distributions.
        w1 = content_word_embeds - word_embeds.mean(0)
        w2 = noise_word_embeds - word_embeds.mean(0)
        mean_diff = ((w1.mean(0) - w2.mean(0))**2).sum().sqrt()
        w1_norm = (w1**2).sum(-1).sqrt().mean()
        w2_norm = (w2**2).sum(-1).sqrt().mean()
        X = word_embeds - word_embeds.mean(0)
        cov = torch.mm(X.t(), X) / word_embeds.size(0)
        U, D, V_t = linalg.svd(cov.cpu().numpy())
        U1 = torch.from_numpy(U[1]).to(utils.device)
        mean1_proj = (w1.mean(0) * U1).sum()
        mean2_proj = (w2.mean(0) * U1).sum()
        diff_proj = ((w1.mean(0) - w2.mean(0)) * U1).sum()
        # plot histograms of these projections
        proj1 = (w1 * U1).sum(-1)
        proj2 = (w2 * U1).sum(-1)
        utils.hist(proj1, 'inliers')
        utils.hist(proj2, 'outliers')
        pdb.set_trace()

    return words_ar, word_embeds, noise_idx
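# The full-whitening step above in miniature: with cov = U diag(D) U^T,
# multiplying the data by diag(1/sqrt(D)) U^T maps it to (approximately)
# identity covariance. A self-contained check on synthetic correlated data:
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((5000, 4)) @ rng.standard_normal((4, 4))  # correlated
cov = np.cov(X, rowvar=False)
U, D, _ = np.linalg.svd(cov)
W = np.diag(1.0 / np.sqrt(D)) @ U.T    # same form as cov_inv above
Xw = X @ W.T
print(np.round(np.cov(Xw, rowvar=False), 2))  # close to the identity matrix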
# Data 2D histogram
axScatter.hist2d(data[:, 0], data[:, 1], density=True, bins=[100, 50],
                 cmap='binary')  # `density` replaces the removed `normed` kwarg

# Analytic pdf
axScatter.contour(X, Y, pdf.reshape(X.shape[0], Y.shape[0]),
                  colors='red', linestyles='--')

# Trained model
axScatter.contour(X, Y, M(grid).reshape(X.shape[0], Y.shape[0]),
                  colors='blue', linestyles='--')

# Conditional plots
# Analytic
axHistx1.plot(xx, cpdf1, color='r', linestyle='--')
axHistx2.plot(xx, cpdf2, color='r', linestyle='--')
axHistx3.plot(xx, cpdf3, color='r', linestyle='--')

# Models
axHistx1.plot(xx, cM1(xx.reshape(1, n)).flatten(), color='b', linestyle='--')
axHistx2.plot(xx, cM2(xx.reshape(1, n)).flatten(), color='b', linestyle='--')
axHistx3.plot(xx, cM3(xx.reshape(1, n)).flatten(), color='b', linestyle='--')

# Conditional histograms (i.e. empirical conditionals)
axHistx1.hist(ut.hist(data, x[0], axes='y'), bins='auto', density=True, color='lightgray')
axHistx2.hist(ut.hist(data, x[1], axes='y'), bins='auto', density=True, color='lightgray')
axHistx3.hist(ut.hist(data, x[2], axes='y'), bins='auto', density=True, color='lightgray')

plt.savefig('student-t.png', bbox_inches='tight')
print('# Image saved at ./student-t.png')
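# `ut.hist` is external to these plotting snippets. Since its return value is
# fed straight into Axes.hist, a plausible reading is that it extracts an
# empirical conditional slice: the samples of one coordinate whose other
# coordinate lies in a narrow band around the conditioning value. A minimal
# sketch under that assumption (the band half-width `eps` is invented):
import numpy as np


def hist(data, value, axes='x', eps=0.1):
    """Samples of one coordinate of `data` (shape (N, 2)), conditioned on
    the other coordinate lying within +/- eps of `value`."""
    cond, kept = (0, 1) if axes == 'y' else (1, 0)
    mask = np.abs(data[:, cond] - value) < eps
    return data[mask, kept]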
def lnlike(self, p):
    lnlike = 0.0
    # need to generalize the following!!
    if self.func is not None:
        lnlike += np.sum(np.log(self.func(self.data[:, 0], self.data[:, 1],
                                          10.0**p[0], 10.0**p[1])))
    else:
        try:
            if self.interpType == 'linear1d':
                # Linear interpolation
                pdf = self.dataComp.rotate2full(np.array(
                    [self.interp[jj](self.sampTrans.range2unit(10.0**p))
                     for jj in range(len(self.interp))]).flatten())
            elif self.interpType == 'gp1d':
                # 1D GP
                if not self.interpErrors:
                    pdf = self.dataComp.rotate2full(np.array(
                        [self.interp[jj].predict(
                            self.dataComp.pca_weights[jj, :],
                            self.sampTrans.range2unit(np.atleast_2d(10.0**p)))[0][0]
                         for jj in range(len(self.interp))]))
                elif self.interpErrors:
                    if not self.interpHyperErrors:
                        pdf = self.dataComp.rotate2full(np.array(
                            [self.interp[jj].sample_conditional(
                                self.dataComp.pca_weights[jj, :],
                                self.sampTrans.range2unit(np.atleast_2d(10.0**p)))
                             for jj in range(len(self.interp))]).flatten())
                    elif self.interpHyperErrors:
                        # draw new kernel hyperparameters from the posterior
                        [self.interp[jj].set_parameter_vector(random.choice(
                            self.gp_kernel_posterior[jj] / np.log10(np.e)))
                         for jj in range(len(self.interp))]
                        # sample conditional, as before
                        pdf = self.dataComp.rotate2full(np.array(
                            [self.interp[jj].sample_conditional(
                                self.dataComp.pca_weights[jj, :],
                                self.sampTrans.range2unit(np.atleast_2d(10.0**p)))
                             for jj in range(len(self.interp))]).flatten())
            elif self.interpType == 'gp2d':
                # 2D GP
                xrot = np.zeros((self.dataComp.user_dim, self.dataComp.user_dim2))
                for ii in range(self.dataComp.user_dim):
                    for jj in range(self.dataComp.user_dim2):
                        if not self.interpErrors:
                            xrot[ii, jj] = self.interp[ii][jj].predict(
                                self.dataComp.unitCore[ii, jj, :],
                                self.sampTrans.range2unit([10.0**p]))[0][0]
                        elif self.interpErrors:
                            if not self.interpHyperErrors:
                                xrot[ii, jj] = self.interp[ii][jj].sample_conditional(
                                    self.dataComp.unitCore[ii, jj, :],
                                    self.sampTrans.range2unit([10.0**p]))
                            elif self.interpHyperErrors:
                                # draw new kernel hyperparameters from the posterior
                                self.interp[ii][jj].set_parameter_vector(
                                    random.choice(
                                        self.gp_kernel_posterior[ii][jj] / np.log10(np.e)))
                                # sample conditional, as before
                                xrot[ii, jj] = self.interp[ii][jj].sample_conditional(
                                    self.dataComp.unitCore[ii, jj, :],
                                    self.sampTrans.range2unit([10.0**p]))
                pdf = self.dataComp.rotate2full(xrot).flatten(order='F')

            # Did you train on the distribution ('linear') or on 'log10'
            # of the distribution?
            if self.interpScale == 'log10':
                pdf = 10.0**pdf

            # construct normalized PDF
            pdf = utils.hist(self.x, pdf)

            # query PDF at data locations
            if self.catalogType == 'median':
                pdf_val = pdf.pdf(self.data)
            elif self.catalogType == 'samples':
                try:
                    pdf_val = np.mean(pdf.pdf(self.data), axis=0)
                except ValueError:
                    # array indexing: [sample, source, parameter]
                    pdf_val = np.mean([pdf.pdf(self.data[kk])
                                       for kk in range(self.data.shape[0])],
                                      axis=0)

            # incorporate expected rate information
            if self.rate_interp is not None:
                if not self.interpErrors:
                    rate = self.rate_interp.predict(
                        self.rate_data,
                        self.sampTrans.range2unit(np.atleast_2d(10.0**p)))[0][0]
                elif self.interpErrors:
                    if not self.interpHyperErrors:
                        rate = self.rate_interp.sample_conditional(
                            self.rate_data,
                            self.sampTrans.range2unit(np.atleast_2d(10.0**p)))
                    elif self.interpHyperErrors:
                        # draw new kernel hyperparameters from the posterior
                        self.rate_interp.set_parameter_vector(random.choice(
                            self.rate_gp_kernel_posterior / np.log10(np.e)))
                        rate = self.rate_interp.sample_conditional(
                            self.rate_data,
                            self.sampTrans.range2unit(np.atleast_2d(10.0**p)))
                rate = 10.0**(self.rate_mean + self.rate_std * rate)
                pdf_val *= rate

            lnlike += np.sum(np.log(pdf_val))

            if self.rate_interp is not None:
                if self.poisson_marg:
                    lnlike -= (1.0 + self.data.shape[0]) * np.log(rate)
                    if rate < 1e-5:
                        lnlike = -np.inf
                elif not self.poisson_marg:
                    lnlike -= rate

            if np.isnan(lnlike):
                lnlike = -np.inf
        except np.linalg.LinAlgError:
            lnlike = -np.inf
    return lnlike
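# `utils.hist(self.x, pdf)` above returns an object exposing .pdf(...), i.e.
# a normalized density built from grid locations self.x and (unnormalized)
# density values. A minimal sketch of that contract for the 1-D case -- the
# class name and the linear interpolation are assumptions, not the actual
# implementation:
import numpy as np


class GridPDF:
    """Normalized 1-D PDF defined by values on a grid."""

    def __init__(self, x, vals):
        vals = np.clip(vals, 0.0, None)        # densities must be non-negative
        self._x = x
        self._vals = vals / np.trapz(vals, x)  # normalize to unit integral

    def pdf(self, data):
        # Evaluate by linear interpolation; note np.interp clamps queries
        # outside the grid to the edge values rather than returning zero.
        return np.interp(data, self._x, self._vals)


def hist(x, vals):
    return GridPDF(x, vals)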