def test_dual_variables():
    """Check the EMD plan, cost and dual variables on two 1-D Gaussian histograms.

    The source and target histograms live on grids of different sizes. The
    test verifies that the cost is symmetric when the problem is transposed,
    that it matches the distance between the two means, that it agrees with
    the cost stored in the solver log, and finally hands the dual variables
    to ``check_duality_gap``.
    """
    n_source = 5000  # number of source bins
    n_target = 6000  # number of target bins

    source_mean = 1000
    target_mean = 1100

    # Bin positions on each grid.
    source_pos = np.arange(n_source, dtype=np.float64)
    target_pos = np.arange(n_target, dtype=np.float64)

    # Gaussian histograms (m = mean, s = standard deviation).
    source_hist = gauss(n_source, m=source_mean, s=5)
    target_hist = gauss(n_target, m=target_mean, s=10)

    # Ground cost: square root of the ot.dist output.
    cost_matrix = ot.dist(source_pos.reshape((-1, 1)),
                          target_pos.reshape((-1, 1))) ** 0.5

    print('Computing {} EMD '.format(1))

    # Forward problem, keeping the solver log for the dual variables.
    ot.tic()
    plan, solver_log = ot.emd(source_hist, target_hist, cost_matrix, log=True)
    ot.toc('1 proc : {} s')

    # Transposed problem; the transposed cost is made contiguous first.
    ot.tic()
    plan_transposed = ot.emd(target_hist, source_hist,
                             np.ascontiguousarray(cost_matrix.T))
    ot.toc('1 proc : {} s')

    forward_cost = np.sum(plan * cost_matrix)

    # Swapping the marginals must not change the cost.
    np.testing.assert_array_almost_equal(
        forward_cost, np.sum(cost_matrix * plan_transposed.T))

    # Closed form for these Gaussians: distance between the means.
    np.testing.assert_almost_equal(forward_cost,
                                   np.abs(source_mean - target_mean))

    # The cost recomputed from the plan must match the logged cost.
    np.testing.assert_almost_equal(forward_cost, solver_log['cost'])

    check_duality_gap(source_hist, target_hist, cost_matrix, plan,
                      solver_log['u'], solver_log['v'], solver_log['cost'])
def test_tic_toc():
    """Check that ``ot.toc`` and ``ot.toq`` report the time elapsed since ``ot.tic``.

    A single ``ot.tic`` is followed by a 0.5 s sleep; both readings are then
    compared with the sleep duration and with each other.
    """
    import time

    ot.tic()
    time.sleep(0.5)
    elapsed_toc = ot.toc()
    elapsed_toq = ot.toq()  # read against the same tic as elapsed_toc

    tolerances = dict(rtol=1e-2, atol=1e-2)

    # The measured time should match the sleep duration.
    np.testing.assert_allclose(0.5, elapsed_toc, **tolerances)

    # Both readers should agree with each other.
    np.testing.assert_allclose(elapsed_toc, elapsed_toq, **tolerances)
def test_emd2_multi():
    """Check ``ot.emd2`` against several target histograms at once (500 bins).

    The losses from the call with an explicit fourth argument of 1 are
    compared with the default call (the two timing labels name these
    '1 proc' and 'multi proc'), and then with the costs returned when
    ``log=True, return_matrix=True``; each logged result is also passed to
    ``check_duality_gap``.
    """
    n_bins = 500

    # Bin positions, shared by the source and every target.
    positions = np.arange(n_bins, dtype=np.float64)

    # Source histogram (m = mean, s = standard deviation).
    source = gauss(n_bins, m=20, s=5)

    # One target histogram per mean, stored column-wise.
    target_means = np.arange(20, 500, 100)
    n_targets = len(target_means)
    targets = np.zeros((n_bins, n_targets))
    for col, mean in enumerate(target_means):
        targets[:, col] = gauss(n_bins, m=mean, s=10)

    # Ground cost between bin positions.
    column = positions.reshape((n_bins, 1))
    cost_matrix = ot.dist(column, column)

    print('Computing {} EMD '.format(n_targets))

    # Reference losses: fourth positional argument set to 1.
    ot.tic()
    reference = ot.emd2(source, targets, cost_matrix, 1)
    ot.toc('1 proc : {} s')

    # Same problem with the default fourth argument.
    ot.tic()
    default_losses = ot.emd2(source, targets, cost_matrix)
    ot.toc('multi proc : {} s')

    np.testing.assert_allclose(reference, default_losses)

    # Same problem again, this time asking for the log and the plan.
    ot.tic()
    logged = ot.emd2(source, targets, cost_matrix, log=True,
                     return_matrix=True)
    ot.toc('multi proc : {} s')

    logged_costs = []
    for col, entry in enumerate(logged):
        cost = entry[0]
        details = entry[1]
        check_duality_gap(source, targets[:, col], cost_matrix,
                          details['G'], details['u'], details['v'], cost)
        logged_costs.append(cost)

    np.testing.assert_allclose(reference, np.array(logged_costs))
def test_emd2_multi():
    """Check ``ot.emd2`` against several target histograms at once (1000 bins).

    The losses from the call with an explicit fourth argument of 1 are
    compared with the default call (the two timing labels name these
    '1 proc' and 'multi proc'), and then with the costs returned when
    ``log=True, return_matrix=True``; each logged result is also passed to
    ``check_duality_gap``.
    """
    n_bins = 1000

    # Bin positions, shared by the source and every target.
    positions = np.arange(n_bins, dtype=np.float64)

    # Source histogram (m = mean, s = standard deviation).
    source = gauss(n_bins, m=20, s=5)

    # One target histogram per mean, stored column-wise.
    target_means = np.arange(20, 1000, 20)
    n_targets = len(target_means)
    targets = np.zeros((n_bins, n_targets))
    for col, mean in enumerate(target_means):
        targets[:, col] = gauss(n_bins, m=mean, s=10)

    # Ground cost between bin positions.
    column = positions.reshape((n_bins, 1))
    cost_matrix = ot.dist(column, column)

    print('Computing {} EMD '.format(n_targets))

    # Reference losses: fourth positional argument set to 1.
    ot.tic()
    reference = ot.emd2(source, targets, cost_matrix, 1)
    ot.toc('1 proc : {} s')

    # Same problem with the default fourth argument.
    ot.tic()
    default_losses = ot.emd2(source, targets, cost_matrix)
    ot.toc('multi proc : {} s')

    np.testing.assert_allclose(reference, default_losses)

    # Same problem again, this time asking for the log and the plan.
    ot.tic()
    logged = ot.emd2(source, targets, cost_matrix, log=True,
                     return_matrix=True)
    ot.toc('multi proc : {} s')

    logged_costs = []
    for col, entry in enumerate(logged):
        cost = entry[0]
        details = entry[1]
        check_duality_gap(source, targets[:, col], cost_matrix,
                          details['G'], details['u'], details['v'], cost)
        logged_costs.append(cost)

    np.testing.assert_allclose(reference, np.array(logged_costs))
def get_MSE(dataset_name, emd, repo):
    """Evaluate a distance predictor on the test split and save diagnostic plots.

    Parameters
    ----------
    dataset_name :
        Passed to ``get_data`` and used as the prefix of every saved figure.
    emd :
        Object exposing ``predict``; it is called with the list
        ``[xtest1, xtest2]`` and must return an array of predictions.
    repo :
        Passed through to ``get_data`` unchanged.

    Returns
    -------
    None
        Metrics are printed and figures are written under ``imgs/``.
    """
    # Local import so the block does not rely on a file-level ``import os``.
    import os

    _, _, test = get_data(dataset_name, repo)
    xtest1, xtest2, ytest = test

    # Time the prediction only, to report a throughput figure below.
    ot.tic()
    ytest_pred = emd.predict([xtest1, xtest2])
    t_est = ot.toc()

    # Error metrics; the squared error is computed once and reused.
    squared_error = np.square(ytest_pred.ravel() - ytest.ravel())
    err = np.mean(squared_error)
    errr = err / np.mean(np.square(ytest.ravel()))
    r = np.corrcoef(ytest.ravel(), ytest_pred.ravel())[0, 1]

    # Per-bin mean and 10th/90th percentiles of the predictions, with bins
    # taken from a histogram of the true values.
    nbin = 30
    yp_mean = np.zeros((nbin,))
    yp_10 = np.zeros((nbin,))
    yp_90 = np.zeros((nbin,))
    _, bins = np.histogram(ytest[:], nbin)
    yp_plot = 0.5 * bins[:-1] + 0.5 * bins[1:]  # bin centres

    for j in range(nbin):
        # NOTE(review): the strict inequalities drop samples lying exactly on
        # a bin edge (including the min and max of ytest) - confirm intended.
        idx = np.where((ytest[:] > bins[j]) * (ytest[:] < bins[j + 1]))
        ytemp = ytest_pred[idx]
        # Test for emptiness, not truthiness: ``ytemp.any()`` would also be
        # False for a populated bin whose predictions are all exactly zero.
        if ytemp.size:
            yp_mean[j] = ytemp.mean()
            yp_10[j] = np.percentile(ytemp, 10)
            yp_90[j] = np.percentile(ytemp, 90)
        else:
            yp_mean[j] = np.nan
            yp_10[j] = np.nan
            yp_90[j] = np.nan

    print('MSE={}\nRel MSE={}\nr={}\nEMD/s={}'.format(
        err, errr, r, ytest_pred.shape[0] / t_est))

    # The figures are saved under imgs/; make sure the directory exists.
    os.makedirs('imgs', exist_ok=True)

    # Scatter of predicted against true values with the identity line.
    pl.figure(1, (8, 3))
    pl.clf()
    pl.plot([0, 45], [0, 45], 'k')
    xl = pl.axis()
    pl.plot(ytest, ytest_pred, '+')
    pl.plot([0, 45], [0, 45], 'k')
    pl.axis(xl)
    pl.xlim([0, 45])
    pl.ylim([0, 45])
    pl.xlabel('True Wass. distance')
    pl.ylabel('Predicted Wass. distance')
    pl.title('True and predicted Wass. distance')
    pl.legend(('Exact prediction', 'Model prediction'))
    pl.savefig('imgs/{}_emd_pred_true.png'.format(dataset_name), dpi=300)
    pl.savefig('imgs/{}_emd_pred_true.pdf'.format(dataset_name))

    # Binned mean and percentile curves against the identity line.
    pl.subplot(1, 2, 2)
    pl.plot([ytest[:].min(), ytest[:].max()],
            [ytest[:].min(), ytest[:].max()], 'k')
    pl.plot(yp_plot[:], yp_mean[:], 'r+-')
    pl.plot(yp_plot[:], yp_10[:], 'g+-')
    pl.plot(yp_plot[:], yp_90[:], 'b+-')
    pl.xlim([0, 45])
    pl.ylim([0, 45])
    pl.legend(('Exact prediction', 'Mean pred', '10th percentile',
               '90th precentile',))
    pl.title('{} MSE:{:3.2f}, RelMSE:{:3.3f}, Corr:{:3.3f}'.format(
        '', err, errr, r))
    pl.grid()
    pl.xlabel('True Wass. distance')
    pl.ylabel('Predicted Wass. distance')
    pl.savefig('imgs/{}_emd_pred_true_quantile.png'.format(dataset_name),
               dpi=300)
    pl.savefig('imgs/{}_perf.png'.format(dataset_name), dpi=300,
               bbox_inches='tight')
    pl.savefig('imgs/{}_perf.pdf'.format(dataset_name), dpi=300,
               bbox_inches='tight')
def get_MSE(dataset_name, emd, repo):
    """Evaluate a distance predictor on the test split and save diagnostic plots.

    Parameters
    ----------
    dataset_name :
        Passed to ``get_data`` and used as the prefix of every saved figure.
    emd :
        Object exposing ``predict``; it is called with the list
        ``[xtest1, xtest2]`` and must return an array of predictions.
    repo :
        Passed through to ``get_data`` unchanged.

    Returns
    -------
    None
        Metrics are printed and figures are written under ``imgs/``.
    """
    _, _, test = get_data(dataset_name, repo)
    xtest1, xtest2, ytest = test

    # Time the prediction only, to report a throughput figure below.
    ot.tic()
    ytest_pred = emd.predict([xtest1, xtest2])
    t_est = ot.toc()

    # Error metrics; the squared error is computed once and reused.
    squared_error = np.square(ytest_pred.ravel() - ytest.ravel())
    err = np.mean(squared_error)
    errr = err / np.mean(np.square(ytest.ravel()))
    r = np.corrcoef(ytest.ravel(), ytest_pred.ravel())[0, 1]

    # Per-bin mean and 10th/90th percentiles of the predictions, with bins
    # taken from a histogram of the true values.
    nbin = 30
    yp_mean = np.zeros((nbin, ))
    yp_10 = np.zeros((nbin, ))
    yp_90 = np.zeros((nbin, ))
    _, bins = np.histogram(ytest[:], nbin)
    yp_plot = 0.5 * bins[:-1] + 0.5 * bins[1:]  # bin centres

    for j in range(nbin):
        # NOTE(review): the strict inequalities drop samples lying exactly on
        # a bin edge (including the min and max of ytest) - confirm intended.
        idx = np.where((ytest[:] > bins[j]) * (ytest[:] < bins[j + 1]))
        ytemp = ytest_pred[idx]
        # Test for emptiness, not truthiness: ``ytemp.any()`` would also be
        # False for a populated bin whose predictions are all exactly zero.
        if ytemp.size:
            yp_mean[j] = ytemp.mean()
            yp_10[j] = np.percentile(ytemp, 10)
            yp_90[j] = np.percentile(ytemp, 90)
        else:
            yp_mean[j] = np.nan
            yp_10[j] = np.nan
            yp_90[j] = np.nan

    print('MSE={}\nRel MSE={}\nr={}\nEMD/s={}'.format(
        err, errr, r, ytest_pred.shape[0] / t_est))

    # Create the output directory in one step instead of check-then-create,
    # which can fail if the directory appears between the two calls.
    os.makedirs('imgs', exist_ok=True)

    # Scatter of predicted against true values with the identity line.
    pl.figure(1, (8, 3))
    pl.clf()
    pl.plot([0, 45], [0, 45], 'k')
    xl = pl.axis()
    pl.plot(ytest, ytest_pred, '+')
    pl.plot([0, 45], [0, 45], 'k')
    pl.axis(xl)
    pl.xlim([0, 45])
    pl.ylim([0, 45])
    pl.xlabel('True Wass. distance')
    pl.ylabel('Predicted Wass. distance')
    pl.title('True and predicted Wass. distance')
    pl.legend(('Exact prediction', 'Model prediction'))
    pl.savefig('imgs/{}_emd_pred_true.png'.format(dataset_name), dpi=300)
    pl.savefig('imgs/{}_emd_pred_true.pdf'.format(dataset_name))

    # Binned mean and percentile curves against the identity line.
    pl.subplot(1, 2, 2)
    pl.plot([ytest[:].min(), ytest[:].max()],
            [ytest[:].min(), ytest[:].max()], 'k')
    pl.plot(yp_plot[:], yp_mean[:], 'r+-')
    pl.plot(yp_plot[:], yp_10[:], 'g+-')
    pl.plot(yp_plot[:], yp_90[:], 'b+-')
    pl.xlim([0, 45])
    pl.ylim([0, 45])
    pl.legend((
        'Exact prediction',
        'Mean pred',
        '10th percentile',
        '90th precentile',
    ))
    pl.title('{} MSE:{:3.2f}, RelMSE:{:3.3f}, Corr:{:3.3f}'.format(
        '', err, errr, r))
    pl.grid()
    pl.xlabel('True Wass. distance')
    pl.ylabel('Predicted Wass. distance')
    pl.savefig('imgs/{}_emd_pred_true_quantile.png'.format(dataset_name),
               dpi=300)
    pl.savefig('imgs/{}_perf.png'.format(dataset_name),
               dpi=300,
               bbox_inches='tight')
    pl.savefig('imgs/{}_perf.pdf'.format(dataset_name),
               dpi=300,
               bbox_inches='tight')
# Finish the figure started before this fragment.
pl.title('Distributions')
pl.tight_layout()

#%% barycenter computation

# Mixing coefficient between the two distributions, 0 <= alpha <= 1.
alpha = 0.5
weights = np.array([1 - alpha, alpha])

# L2 barycenter: weighted sum of the columns of A.
bary_l2 = np.dot(A, weights)

# Wasserstein barycenter from ot.bregman.barycenter; reg is the
# regularisation value handed to it.
reg = 1e-3
ot.tic()
bary_wass = ot.bregman.barycenter(A, M, reg, weights)
ot.toc()

# Wasserstein barycenter from ot.lp.barycenter with the interior-point solver.
ot.tic()
bary_wass2 = ot.lp.barycenter(
    A, M, weights,
    solver='interior-point',
    verbose=True,
)
ot.toc()

# Second figure: the input distributions in the top panel.
pl.figure(2)
pl.clf()
pl.subplot(2, 1, 1)
for i in range(n_distributions):
    pl.plot(x, A[:, i])
pl.title('Distributions')
# Bin positions (n is defined before this fragment).
x = np.arange(n, dtype=np.float64)

# Source histogram (m = mean, s = standard deviation).
a = gauss(n, m=20, s=5)

# Target histograms: one column per mean in ls.
ls = list(range(20, 1000, 10))
nb = len(ls)
b = np.zeros((n, nb))
for i, mean_i in enumerate(ls):
    b[:, i] = gauss(n, m=mean_i, s=10)

# Ground cost between bin positions.
M = ot.dist(x.reshape((n, 1)), x.reshape((n, 1)))

#%%

print('Computing {} EMD '.format(nb))

# EMD losses with the fourth positional argument set to 1
# (labelled '1 proc' in the timing message).
ot.tic()
emd_loss4 = ot.emd2(a, b, M, 1)
ot.toc('1 proc : {} s')

# EMD losses with the default fourth argument (labelled 'multi proc');
# this overwrites the previous result.
ot.tic()
emd_loss4 = ot.emd2(a, b, M)
ot.toc('multi proc : {} s')
# NOTE(review): the enclosing scope of this fragment is not visible here; the
# appends to time_COT / perf_COT suggest it runs once per repetition - confirm
# the nesting level of this code and of the final summary.

# -------------------- III. run experiments --------------------------------

# ------------------- COT ---------------------------------------------------
# Time the cot_numpy call and record the elapsed time.
ot.tic()
Tv, Tc, cost = cot_numpy(
    Sx,
    Tx,
    niter=100,
    C_lin=M_lin.T,
    algo='sinkhorn',
    reg=1e0,
    algo2='emd',
    verbose=False,
)
time_COT.append(ot.toc())

# Propagate the encoded Ty labels through Tv, then decode them.
yt_onehot = enc.fit_transform(Ty.reshape(-1, 1))
ys_onehot_estimated = Tv.shape[0] * np.dot(Tv, yt_onehot)
ys_estimated = enc.inverse_transform(ys_onehot_estimated).reshape(-1)

# Accuracy, in percent, on the entries selected by idx_inv.
perf = 100 * np.mean(Sy[idx_inv] == ys_estimated[idx_inv])
perf_COT.append(perf)
print('Accuracy COT labelprop: {:.2f}'.format(perf))

# Summary statistics, printed only when n_samples is non-zero.
if n_samples != 0:
    print('mean perf baseline= {:.2f} ({:.2f})'.format(
        np.mean(perf_baseline), np.std(perf_baseline)))
    print('mean perf COT= {:.2f} ({:.2f})'.format(
        np.mean(perf_COT), np.std(perf_COT)))