def performance(tol):
    """
    ============================================================================
    Assesses the performance of the Bracket Descent and Scipy L-BFGS-B methods
    ============================================================================

    Parameters
    ------------
    tol : float
        Tolerance used for the minimization.

    Returns
    ------------
    This function produces 4 figures. The first 3 compare the precision of each
    method, while the 4th compares the timing.

    The first three figures show the location of the computed minima for initial
    guesses of [-100,-3], [-50,-3], [-10,-3] and [-1,-3]. These are overlaid on the
    original cost function; the Scipy L-BFGS-B results are shown as red diamonds and
    the Bracket Descent results as blue diamonds. The three figures correspond to
    noise amplitudes of 0, 1, and 10.

    The final figure consists of four subplots. The upper row shows the computational
    time taken for convergence, given an initial x starting point, while the lower
    row shows the number of iterations required. In each case the Scipy L-BFGS-B
    method is shown on the left and Bracket Descent on the right. A legend on each
    plot distinguishes the cases where the noise amplitude is set to 0, 1, and 10.

    Trends Observed
    ----------------
    For all cases, the Scipy minimization appears to be more consistent (i.e. to
    depend less on the initial guess) than the Fortran Bracket Descent method. This
    is seen in figures hw241-hw243, where the B-D results cover a broader spread of
    final coordinates. These figures also show that as the noise level of the cost
    function is increased, the Scipy L-BFGS-B method becomes increasingly favourable
    over the Bracket Descent approach, producing more precise results each time.
    This follows from the lack of any treatment of noise within the Bracket Descent
    method: any random fluctuation that brings two neighbouring points along the
    convergence path within the tolerance limit is taken to be the true minimum of
    the function, as defined by the B-D stopping criterion (a small numerical
    illustration of this effect is sketched after this function). The Scipy L-BFGS-B
    method, by contrast, is likely better adapted to noisy functions and hence finds
    the true minimum more reliably.

    Figure hw244, however, demonstrates an advantage of the B-D method over the Scipy
    L-BFGS-B minimization in terms of timing: despite requiring more iterations to
    converge to within a set tolerance, its total computational time is lower by
    roughly a factor of 10.
    """
    plt.close('all')
    count = 0
    hw2.tol = tol
    nintb = []
    nintl = []
    tlbfgsb = []
    txfbd = []
    lbfgsx = []
    lbfgsy = []
    xfbdx = []
    xfbdy = []
    cost.c_noise = True
    for cost.c_noise_amp in [0., 1., 10.]:
        count = count + 1
        for [x, y] in [[-100., -3.], [-50., -3.], [-10., -3.], [-1., -3.]]:
            t12 = 0
            t34 = 0
            for i in range(0, 1000):
                t1 = time()
                scipy.optimize.minimize(cost.costj, [x, y], method='L-BFGS-B', tol=tol)
                t2 = time()
                t12 = t12 + (t2 - t1)
                t3 = time()
                hw2.bracket_descent([x, y])
                t4 = time()
                t34 = t34 + (t4 - t3)
            tlbfgsb.append(t12 / 1000)
            txfbd.append(t34 / 1000)

            info = scipy.optimize.minimize(cost.costj, [x, y], method='L-BFGS-B', tol=tol)
            xfbd, jfbd, i2 = hw2.bracket_descent([x, y])

            x = info.x
            lbfgsx.append(x[0])
            lbfgsy.append(x[1])
            xfbdx.append(xfbd[0])
            xfbdy.append(xfbd[1])
            nint = info.nit
            nintl.append(nint)
            nintb.append(i2)

        Minx = 1 + (min([min(xfbdx[(count - 1) * 4:count * 4]),
                         min(lbfgsx[(count - 1) * 4:count * 4])]) - 1) * 1.1
        Maxx = 1 + (max([max(xfbdx[(count - 1) * 4:count * 4]),
                         max(lbfgsx[(count - 1) * 4:count * 4])]) - 1) * 1.1
        Miny = 1 + (min([min(xfbdy[(count - 1) * 4:count * 4]),
                         min(lbfgsy[(count - 1) * 4:count * 4])]) - 1) * 1.1
        Maxy = 1 + (max([max(xfbdy[(count - 1) * 4:count * 4]),
                         max(lbfgsy[(count - 1) * 4:count * 4])]) - 1) * 1.1
        [X, Y] = np.linspace(Minx, Maxx, 200), np.linspace(Miny, Maxy, 200)
        # calculate noiseless cost function at each point on 2D grid
        j = [[cost.costj([xi, yi]) for xi in X] for yi in Y]

        # create contour plot of the cost function, overlaid with the converged points
        fig, p4 = plt.subplots()
        cp = p4.contourf(X, Y, j, locator=ticker.LogLocator(), cmap=cm.GnBu)
        cbar = fig.colorbar(cp)
        BD, = p4.plot(xfbdx[(count - 1) * 4:count * 4], xfbdy[(count - 1) * 4:count * 4],
                      'b', linestyle='None', marker='d', markersize=6)
        Scipy, = p4.plot(lbfgsx[(count - 1) * 4:count * 4], lbfgsy[(count - 1) * 4:count * 4],
                         'r', linestyle='None', marker='d', markersize=6)
        BD.set_label('Fortran Bracket Descent')
        Scipy.set_label('Scipy optimize L-BFGS-B')
        plt.legend(loc='upper left', fontsize='small')
        plt.suptitle('Rosemary Teague, performance \n Comparison of converged values, Noise='
                     + str(int(cost.c_noise_amp)))
        plt.savefig('hw24' + str(count), dpi=700)

    print(tlbfgsb)
    plt.close('all')

    f4, (p414, p424) = plt.subplots(2, 2, sharey=True)
    one, = p414[0].plot(tlbfgsb[:4],
                        [np.abs(-100.), np.abs(-50.), np.abs(-10.), np.abs(-1.)],
                        'r', marker='x', markersize=12)
    two, = p414[0].plot(tlbfgsb[4:8],
                        [np.abs(-100.), np.abs(-50.), np.abs(-10.), np.abs(-1.)],
                        'm', marker='x', markersize=12)
    three, = p414[0].plot(tlbfgsb[8:],
                          [np.abs(-100.), np.abs(-50.), np.abs(-10.), np.abs(-1.)],
                          '#c79fef', marker='x', markersize=12)
    one.set_label('No Noise')
    two.set_label('Noise = 1.0')
    three.set_label('Noise = 10.0')
    p414[0].set_title('Scipy Optimise L-BFGS-B')
    p414[0].set_xlabel('Time Taken')
    p414[0].legend(loc='upper right', fontsize='x-small')
    p414[0].xaxis.set_ticks(np.linspace(min(tlbfgsb), max(tlbfgsb), 3))
    p414[0].ticklabel_format(useOffset=False)

    uno, = p414[1].plot(txfbd[:4],
                        [np.abs(-100. - xfbdx[0]), np.abs(-50. - xfbdx[1]),
                         np.abs(-10. - xfbdx[2]), np.abs(-1. - xfbdx[3])],
                        'b', marker='x', markersize=12)
    dos, = p414[1].plot(txfbd[4:8],
                        [np.abs(-100. - xfbdx[4]), np.abs(-50. - xfbdx[5]),
                         np.abs(-10. - xfbdx[6]), np.abs(-1. - xfbdx[7])],
                        'g', marker='x', markersize=12)
    tres, = p414[1].plot(txfbd[8:],
                         [np.abs(-100. - xfbdx[8]), np.abs(-50. - xfbdx[9]),
                          np.abs(-10. - xfbdx[10]), np.abs(-1. - xfbdx[11])],
                         'c', marker='x', markersize=12)
    uno.set_label('No Noise')
    dos.set_label('Noise = 1.0')
    tres.set_label('Noise = 10.0')
    p414[1].set_title('Fortran Bracket Descent')
    p414[1].set_xlabel('Time Taken')
    p414[1].legend(loc='upper left', fontsize='x-small')
    p414[1].xaxis.set_ticks(np.linspace(min(txfbd), max(txfbd), 3))

    one1, = p424[0].plot(nintl[:4],
                         [np.abs(-100. - lbfgsx[0]), np.abs(-50. - lbfgsx[1]),
                          np.abs(-10. - lbfgsx[2]), np.abs(-1. - lbfgsx[3])],
                         'r', marker='x', markersize=12)
    two2, = p424[0].plot(nintl[4:8],
                         [np.abs(-100. - lbfgsx[4]), np.abs(-50. - lbfgsx[5]),
                          np.abs(-10. - lbfgsx[6]), np.abs(-1. - lbfgsx[7])],
                         'm', marker='x', markersize=12)
    three3, = p424[0].plot(nintl[8:],
                           [np.abs(-100. - lbfgsx[8]), np.abs(-50. - lbfgsx[9]),
                            np.abs(-10. - lbfgsx[10]), np.abs(-1. - lbfgsx[11])],
                           '#c79fef', marker='x', markersize=12)
    one1.set_label('No Noise')
    two2.set_label('Noise = 1.0')
    three3.set_label('Noise = 10.0')
    p424[0].set_xlabel('Number of Iterations')
    p424[0].legend(loc='upper left', fontsize='x-small')
    p424[0].ticklabel_format(useOffset=False)

    uno1, = p424[1].plot(nintb[:4],
                         [np.abs(-100. - xfbdx[0]), np.abs(-50. - xfbdx[1]),
                          np.abs(-10. - xfbdx[2]), np.abs(-1. - xfbdx[3])],
                         'b', marker='x', markersize=12)
    dos2, = p424[1].plot(nintb[4:8],
                         [np.abs(-100. - xfbdx[4]), np.abs(-50. - xfbdx[5]),
                          np.abs(-10. - xfbdx[6]), np.abs(-1. - xfbdx[7])],
                         'g', marker='x', markersize=12)
    tres3, = p424[1].plot(nintb[8:],
                          [np.abs(-100. - xfbdx[8]), np.abs(-50. - xfbdx[9]),
                           np.abs(-10. - xfbdx[10]), np.abs(-1. - xfbdx[11])],
                          'c', marker='x', markersize=12)
    uno1.set_label('No Noise')
    dos2.set_label('Noise = 1.0')
    tres3.set_label('Noise = 10.0')
    p424[1].set_xlabel('Number of Iterations')
    p424[1].legend(loc='upper left', fontsize='x-small')

    f4.text(0.04, 0.5, 'Initial x-distance from Converged minimum',
            va='center', rotation='vertical')
    plt.suptitle('Rosemary Teague, performance \n Time taken for values to converge',
                 fontsize='large')
    plt.tight_layout(pad=3.5, h_pad=1, w_pad=1)
    plt.savefig('hw244', dpi=700)
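# --------------------------------------------------------------------------
# Illustration (not part of the original assignment code) of the argument in
# the "Trends Observed" notes above: once noise is added to the cost, the
# change in cost between two neighbouring points on the convergence path no
# longer measures true progress, so a tolerance-based stopping test can fire
# erratically. The cost function and amplitude below are hypothetical,
# chosen only for illustration.
def _noise_vs_tolerance_sketch(amp=1.0, nsamples=5, seed=0):
    """Compare the true cost change over a small step with noisy evaluations of it."""
    rng = np.random.default_rng(seed)

    def true_cost(x):
        return (x - 1.0)**2            # smooth stand-in cost with minimum at x = 1

    def noisy_cost(x):
        return true_cost(x) + amp * rng.standard_normal()

    x, dx = 1.001, 1e-4                # two neighbouring points near the minimum
    true_diff = abs(true_cost(x) - true_cost(x + dx))
    noisy_diffs = [abs(noisy_cost(x) - noisy_cost(x + dx)) for _ in range(nsamples)]
    # Near the minimum the true change is tiny, so the noisy differences are
    # dominated by the random fluctuations rather than by genuine progress.
    print('true |dJ| =', true_diff, ' noisy |dJ| samples =', noisy_diffs)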
def bracket_descent_test(xg, display=False, compare=False, i=1):
    """
    ======================================================================================
    Use the Bracket Descent method to minimize a cost function, j, defined in cost module.
    ======================================================================================

    Parameters
    ----------
    xg : list
        Initial guess
    display : Boolean, optional
        If set to True, figures will be created to illustrate the optimization path
        taken and the distance from convergence at each step.
    compare : Boolean, optional
        If set to True, a figure will be created to directly compare the Newton and
        Bracket Descent methods.
    i : Integer, optional (default 1)
        Sets the name of the figures as hw231(/2)_i.png.

    Returns
    ---------
    xf : ndarray
        Computed location of minimum
    jf : float
        Computed minimum
    output : Tuple
        Contains the time taken for the minimum to be found by each of the Newton and
        Bracket Descent methods, averaged over 10 runs. Only set if the compare
        parameter is True, otherwise empty.

    Calling this function will produce two figures. The first contains two subplots
    illustrating the location of each step in the minimization path, overlaid on the
    initial cost function, and the distance of j from the final, computed minimum at
    each iteration. The second plot (only produced when 'compare' is set to True)
    shows the distance of each step from the final, converged minimum at each
    iteration. This shows that the Newton method requires significantly fewer steps
    and is hence faster.

    Trends Observed
    ----------------
    Figures hw231_i show that the path taken during a bracket descent convergence is
    much longer than in a Newton convergence (shown in figures hw22i). This is because
    the B-D method limits the size of a step to 2*L, where L is defined by the size of
    an equilateral triangle whose centroid moves with each step. The method is,
    furthermore, designed such that this triangle only decreases in size per
    iteration, so the maximum step length can only shrink (never grow) throughout the
    convergence. The figures further show that steps initially appear to be taken
    perpendicular to the curvature, finding the minimum along that strip, and then
    converging down the parallel path until they reach the tolerance level.

    In contrast, the Newton approach is not limited in the size of the steps it can
    take and hence converges in far fewer iterations. This is a result of its use of
    gradients. Figures hw22i illustrate how each step travels through many bands of
    the contour plot (each representing a difference of one order of magnitude) as the
    method searches for the direction of minimisation.
    """
    cost.c_noise = False
    hw2.tol = 10**(-6)
    hw2.itermax = 1000
    t34 = 0
    output = ()
    if compare:
        N = 10
    else:
        N = 1
    for j in range(N):  # average over N runs (was range(1, N), which ran nothing when N=1)
        t3 = time()
        hw2.bracket_descent(xg)
        t4 = time()
        t34 = t34 + (t4 - t3)

    X, Y = hw2.xpath
    xf = [X[-1], Y[-1]]
    jf = hw2.jpath[-1]
    d1 = np.sqrt((X - xf[0])**2 + (Y - xf[1])**2)

    if display:
        Minx = min(X) - 1
        Maxx = max(X) + 1
        Miny = min(Y) - 1
        Maxy = max(Y) + 1
        [Xj, Yj] = np.linspace(Minx, Maxx, 200), np.linspace(Miny, Maxy, 200)
        # calculate noiseless cost function at each point on 2D grid
        j = [[cost.costj([xi, yi]) for xi in Xj] for yi in Yj]

        f, (p1, p2) = plt.subplots(1, 2)
        p1.contourf(Xj, Yj, j, locator=ticker.LogLocator(), cmap=cm.GnBu)
        p1.plot(X, Y, 'g', marker='d')
        p1.set_xlabel('X1-location')
        p1.set_ylabel('X2-location')
        p1.set_title('Convergence Path')
        p2.semilogy(np.linspace(1, len(X), len(X)), hw2.jpath)
        p2.set_xlabel('Iteration number')
        p2.set_ylabel('distance from converged minimum')
        p2.set_title('Rate')
        plt.suptitle('Rosemary Teague, bracket_descent_test, initial guess =' + str(xg)
                     + ' \n Rate of convergence of a cost function')
        plt.tight_layout(pad=4)
        plt.savefig('hw231_' + str(i), dpi=700)

    if compare:
        plt.close('all')
        One, = plt.loglog(np.linspace(1, len(X), len(X)), hw2.jpath)
        xf2, jf2, outputn = newton_test(xg, timing=True)
        X2, Y2 = outputn[1], outputn[2]
        d2 = np.sqrt((X2 - xf2[0])**2 + (Y2 - xf2[1])**2)
        print(np.linspace(1, len(X2), len(X2)), outputn[3])
        Two, = plt.loglog(np.linspace(1, len(X2), len(X2)), outputn[3])
        One.set_label('Bracket Descent')
        Two.set_label('Newton')
        plt.xlabel('Iteration number')
        plt.ylabel('Distance from converged minimum')
        plt.legend()
        plt.title('Rosemary Teague, bracket_descent_test, initial guess =' + str(xg)
                  + ' \n Comparison of Newton and Bracket Descent Methods')
        plt.savefig('hw232_' + str(i), dpi=700)
        output = (outputn[0], t34 / N)

    return xf, jf, output
def performance(noise=False):
    """
    Assess performance of B-D and L-BFGS-B methods. Add input/output as needed.

    (i) NO NOISE:
    The first figure is a bar plot of the number of iterations it takes L-BFGS-B and
    bracket descent to converge from different starting points. The clear trend is
    that the further from [1,1] we start, the more iterations both algorithms need.
    We can also see that bracket descent takes roughly twice as many iterations when
    starting at [-1,-3], and the gap grows as we decrease the x coordinate of the
    initial guess. Having investigated above how bracket descent works, its high
    iteration count is not surprising. L-BFGS-B is a quasi-Newton method that
    approximates the Hessian, so it acts somewhat like the Newton method and takes
    fewer steps. It still takes more steps than Newton, however, because it only
    approximates the Hessian rather than computing it exactly.

    The second figure shows the final values of the cost function at the points where
    the two algorithms converged. The final cost values are much lower (by a factor of
    about 1e6) for L-BFGS-B than for bracket descent. This is the second performance
    edge of L-BFGS-B over bracket descent: it converges to a much lower value. The
    final value of bracket descent is nonetheless very small and close to zero. The
    huge difference in the cost values of L-BFGS-B and B-D arises because, near the
    minimum [1,1], the function changes dramatically in its marginal values (going to
    0 at [1,1]). The overall trend is that L-BFGS-B converges to a better minimum.

    The third graph shows the average time it takes L-BFGS-B and bracket descent to
    converge to the minimum, averaged over 10 consecutive runs. The time is given in
    microseconds (1e-6 s), so both algorithms are very fast. However, bracket descent
    is significantly faster than L-BFGS-B: 34 times faster from the closest starting
    point, with the margin growing to 40 times for the furthest point. This is an
    interesting performance edge of bracket descent over L-BFGS-B. Bracket descent is
    so much faster because it performs many small computations that are light on
    memory. The code is written in Fortran, so the loops are properly optimized, and
    the only operations bracket descent performs are multiplications, divisions and a
    handful of if statements. Such computations are much quicker than approximating
    the function's Hessian and solving linear systems, as L-BFGS-B does. Overall, in
    terms of time bracket descent is much faster, but it needs more iterations to
    converge and converges to weaker minima. Bracket descent also fails to converge
    for some starting values (try [40,240]).

    (ii) NOISE:
    The first graph shows the final values of the cost function for the two optimizers
    when noise of amplitude 100 is introduced. The sides now flip relative to the
    no-noise case: bracket descent seems to converge to a minimum that is lower than,
    or similar to, that of L-BFGS-B. This could be because L-BFGS-B, relying on an
    approximation of the Hessian, has its steps accurately calculated and is somewhat
    thrown off when it lands on a spot where noise was added, whereas bracket descent
    simply follows the decreasing cost without planning its walk ahead.

    The second graph shows again that the further the starting point is from [1,1],
    the longer convergence takes (as expected). Bracket descent is still significantly
    faster than L-BFGS-B, but the margin is smaller: 6 times faster for the closest
    point and 10 times faster for the furthest, which is a significant reduction in
    its speed advantage compared with the noiseless case. Adding noise has slowed
    bracket descent more than it has slowed L-BFGS-B. This may be an effect of the
    ravines and tunnels created by the noise, through which bracket descent has to
    manoeuvre; since it takes many more small steps than L-BFGS-B, it is more affected
    by them.

    (iii) IMPROVEMENTS:
    One obvious improvement would be to generalize the code to work on arbitrary cost
    functions rather than only the cost provided by the cost.f90 module. This should
    not be hard to implement: the function would simply be passed as an input. The
    code would then be far more useful, since someone could optimize a function of
    their own choosing with our module and, using the Python code, compare their
    results from the Newton, bracket descent and L-BFGS-B methods and pick whichever
    suits them best. To that end, the code above could also be modified so that the
    user chooses the initial guess, with the graphs and figures produced accordingly
    (a sketch of such a generalized comparison helper is given after this function).
    Another improvement would be to add more optimizers against which Newton and
    bracket descent are compared. A scientifically literate user would then find a
    comprehensive review of the distinct features of the different optimizers and
    could pick the one that works best for them.
    """
    if noise is False:
        cost.c_noise = False
        # initialize matrix of starting points
        xg_mat = [[-1, -3], [-10, -3], [-50, -3], [-100, -3]]
        # all necessary declarations
        nit_bd = np.zeros([4])
        nit_lbfgs = np.zeros([4])
        idx = 0
        bracket_costs = np.zeros([4])
        lbfgs_costs = np.zeros([4])
        # create figures
        fig1 = plt.figure(figsize=(7, 6))
        fig2 = plt.figure(figsize=(7, 6))
        fig3 = plt.figure(figsize=(7, 6))
        ax = fig1.add_subplot(111)
        ax2 = fig3.add_subplot(111)
        ax3 = fig2.add_subplot(211)
        ax4 = fig2.add_subplot(212)

        # do time average
        lbfgs_av_times = np.zeros([4])
        bracket_av_times = np.zeros([4])
        for i in range(10):
            bracket_times = np.zeros([4])
            lbfgs_times = np.zeros([4])
            k = 0
            for points in xg_mat:
                st_lbfgs = time.time()
                lbfgs = scopt.minimize(cost.costj, points, method='L-BFGS-B',
                                       options={'gtol': 1e-6, 'maxiter': 1000})
                lbfgs_times[k] = 1e6 * (time.time() - st_lbfgs)
                st_bracket = time.time()
                bd = hw2.bracket_descent(points)
                bracket_times[k] = 1e6 * (time.time() - st_bracket)
                k += 1
            lbfgs_av_times += lbfgs_times
            bracket_av_times += bracket_times
        lbfgs_av_times = lbfgs_av_times / 10
        bracket_av_times = bracket_av_times / 10

        # do other graphs
        for points in xg_mat:
            lbfgs = scopt.minimize(cost.costj, points, method='L-BFGS-B',
                                   options={'gtol': 1e-6, 'maxiter': 1000})
            bd = hw2.bracket_descent(points)
            # number of iterations
            dummy1 = len(hw2.xpath)
            nit_lbfgs[idx] = lbfgs.nit
            nit_bd[idx] = dummy1
            # final costs
            bracket_costs[idx] = bd[1]
            lbfgs_costs[idx] = lbfgs.fun
            idx += 1

        # plot of number of iterations
        x = list(range(1, 5))
        ax.bar(np.array(x) - 0.1, nit_bd, width=0.2, color='b',
               align='center', label='bracket_descent')
        ax.bar(np.array(x) + 0.1, nit_lbfgs, width=0.2, color='r',
               align='center', label='l-bfgs-b')
        ax.set_xticks(x)
        ax.set_xticklabels(xg_mat)
        ax.set_xlabel("Initial xguess")
        ax.set_ylabel("Number of algorithm iterations it takes to converge")
        ax.legend()
        ax.set_title("Bar plot showing the number of iterations \n until convergence for "
                     "bracket descent and l-bfgs-b. \n Cmd. performance(), Igor Adamski")

        # plot of times
        ax2.bar(np.array(x) - 0.1, bracket_av_times, width=0.2, color='b',
                align='center', label='bracket_descent')
        ax2.bar(np.array(x) + 0.1, lbfgs_av_times, width=0.2, color='r',
                align='center', label='l-bfgs-b')
        ax2.set_xticks(x)
        ax2.set_xticklabels(xg_mat)
        ax2.set_xlabel("Initial xguess")
        ax2.set_ylabel(r"Average time it takes for the algorithm to converge ($\mu$s)")
        ax2.legend()
        ax2.set_title("Bar plot showing the average time it takes \n bracket descent and "
                      "l-bfgs-b to converge (mean of 10 runs). \n Cmd. performance(), Igor Adamski")
        for i, v in enumerate(bracket_av_times):
            ax2.text(i + 0.79, v + 0.01, "{0:.1f}".format(v), color='black')
        for i, v in enumerate(lbfgs_av_times):
            ax2.text(i + 0.99, v + 0.01, "{0:.1f}".format(v), color='black')

        # plot of final costs
        ax3.bar(np.array(x), bracket_costs, width=0.2, color='b',
                align='center', label='bracket_descent')
        ax3.set_xticks(x)
        ax3.set_xticklabels(xg_mat)
        ax3.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
        ax4.bar(np.array(x), lbfgs_costs, width=0.2, color='r',
                align='center', label='l-bfgs-b')
        ax3.set_ylabel('Value of cost')
        ax4.set_xlabel('Different starting xguess')
        ax4.set_xticks(x)
        ax4.set_xticklabels(xg_mat)
        ax4.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
        ax4.set_ylabel('Value of cost')
        ax3.legend()
        ax4.legend()
        fig2.suptitle('Final values of the cost function after convergence \n for bracket '
                      'descent and l-bfgs-b. \n Cmd. performance(), Igor Adamski')

    elif noise is True:
        cost.c_noise = True
        cost.c_noise_amp = 100.
        xg_mat = [[-1, -3], [-10, -3], [-50, -3], [-100, -3]]
        # all the same initializations as above but with noise
        nit_bd = np.zeros([4])
        nit_lbfgs = np.zeros([4])
        xpath_lbfgs = {}
        idx = 0
        bracket_costs = np.zeros([4])
        lbfgs_costs = np.zeros([4])
        fig2 = plt.figure(figsize=(7, 6))
        fig3 = plt.figure(figsize=(7, 6))
        ax2 = fig3.add_subplot(111)
        ax3 = fig2.add_subplot(211)
        ax4 = fig2.add_subplot(212)

        # do time average
        lbfgs_av_times = np.zeros([4])
        bracket_av_times = np.zeros([4])
        for i in range(10):
            bracket_times = np.zeros([4])
            lbfgs_times = np.zeros([4])
            k = 0
            for points in xg_mat:
                st_lbfgs = time.time()
                lbfgs = scopt.minimize(cost.costj, points, method='L-BFGS-B',
                                       options={'gtol': 1e-6, 'maxiter': 1000})
                lbfgs_times[k] = 1e6 * (time.time() - st_lbfgs)
                st_bracket = time.time()
                bd = hw2.bracket_descent(points)
                bracket_times[k] = 1e6 * (time.time() - st_bracket)
                k += 1
            lbfgs_av_times += lbfgs_times
            bracket_av_times += bracket_times
        lbfgs_av_times = lbfgs_av_times / 10
        bracket_av_times = bracket_av_times / 10

        # do other graphs
        for points in xg_mat:
            lbfgs = scopt.minimize(cost.costj, points, method='L-BFGS-B',
                                   options={'gtol': 1e-6, 'maxiter': 1000})
            bd = hw2.bracket_descent(points)
            # final costs
            bracket_costs[idx] = bd[1]
            lbfgs_costs[idx] = lbfgs.fun
            idx += 1

        # plot of times
        x = list(range(1, 5))
        ax2.bar(np.array(x) - 0.1, bracket_av_times, width=0.2, color='b',
                align='center', label='bracket_descent')
        ax2.bar(np.array(x) + 0.1, lbfgs_av_times, width=0.2, color='r',
                align='center', label='l-bfgs-b')
        ax2.set_xticks(x)
        ax2.set_xticklabels(xg_mat)
        ax2.set_xlabel("Initial xguess")
        ax2.set_ylabel(r"Average time it takes for the algorithm to converge ($\mu$s)")
        ax2.legend()
        ax2.set_title("Bar plot showing the average time it takes bracket descent \n and "
                      "l-bfgs-b to converge (mean of 10 runs) for cost with noise 100. \n "
                      "Cmd. performance(True), Igor Adamski")
        for i, v in enumerate(bracket_av_times):
            ax2.text(i + 0.79, v + 0.01, "{0:.1f}".format(v), color='black')
        for i, v in enumerate(lbfgs_av_times):
            ax2.text(i + 0.99, v + 0.01, "{0:.1f}".format(v), color='black')

        # plot of final costs
        ax3.bar(np.array(x), bracket_costs, width=0.2, color='b',
                align='center', label='bracket_descent')
        ax3.set_xticks(x)
        ax3.set_xticklabels(xg_mat)
        ax3.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
        ax3.axhline(0, color="black", linewidth=0.9)
        ax3.set_ylabel('Value of cost')
        ax4.bar(np.array(x), lbfgs_costs, width=0.2, color='r',
                align='center', label='l-bfgs-b')
        ax4.set_xticks(x)
        ax4.set_xticklabels(xg_mat)
        ax4.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
        ax4.axhline(0, color="black", linewidth=0.9)
        ax4.set_ylabel('Value of cost')
        ax4.set_xlabel('Initial xguess')
        ax3.legend()
        ax4.legend()
        fig2.suptitle('Final values of the cost function after convergence \n for bracket '
                      'descent and l-bfgs-b with added noise of 100. \n '
                      'Cmd. performance(True), Igor Adamski')
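# --------------------------------------------------------------------------
# Sketch of the generalization suggested in part (iii) of the docstring above
# (not part of the original assignment code): the cost function, starting
# guesses and optimizer list become inputs instead of being hard-coded. The
# helper below is a minimal, hypothetical example; it assumes only the
# standard scipy.optimize.minimize interface (imported as scopt, as elsewhere
# in this file) and returns the results rather than drawing the bar plots.
def _compare_optimizers_sketch(costfun, xguesses, methods=('L-BFGS-B', 'Nelder-Mead')):
    """Run scipy.optimize.minimize with several methods from several starting points."""
    results = {}
    for method in methods:
        for xg in xguesses:
            res = scopt.minimize(costfun, xg, method=method)
            # store final point, final cost and iteration count per (method, start)
            results[(method, tuple(xg))] = (res.x, res.fun, res.nit)
    return results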
def bracket_descent_test(xg, display=False):
    """
    Use bracket-descent to minimize cost function defined in cost module.
    Input variable xg is initial guess for location of minimum.
    When display is true, 1-2 figures comparing the B-D and Newton steps should be generated.
    Output variables: xf -- computed location of minimum, jf -- computed minimum.

    On the first figure we can see the paths that the Newton and bracket descent
    algorithms take to converge to the minimum. Newton tends to take very few steps
    (at most 6-7), and the steps it takes are big: Newton makes huge moves in
    directions far from the final converged minimum, but always manages to return.
    Bracket descent, on the other hand, takes lots of very small steps. Its steps are
    small but ultimately lead it to the minimum, and, in contrast to Newton, it does
    not jump in directions opposite to where it is supposed to go.

    On the second figure we can see the normed distance between the current point and
    the true minimum at [1,1] plotted against the iteration number. Newton at first
    jumps very far from the actual minimum, only to converge to [1,1] two steps
    later. Bracket descent, by contrast, tends to stay on a downward trajectory,
    except for a few moments when it moves slightly further from [1,1] before
    promptly returning. Overall, it is clear that Newton needs far fewer iterations
    (by a factor of hundreds) to converge than bracket descent.

    The behaviour of Newton in terms of the number of iterations is not surprising,
    as it relies on the gradient and Hessian to guide it: these give information
    about the curvature of the cost surface, which keeps Newton on the right track.
    Bracket descent does not possess such higher-level information about the cost
    function; it simply iterates, finds smaller values of the cost function, and
    takes little steps adjusting the triangle's vertices. Bracket descent takes small
    steps because, by construction, each vertex move is limited to at most 4 heights
    of the triangle (a small geometric check of this kind of bound is sketched after
    this function). Newton takes big steps because the gradient and Hessian can lead
    it far away, only for it to return towards the minimum in the next step.
    """
    # call bracket descent and extract paths
    cost.c_noise = False
    bd = hw2.bracket_descent(xg)
    xf = bd[0]
    jf = bd[1]
    xpath_bd = hw2.xpath.copy()
    # compute normed distance to [1,1]
    norm_distance_bd = np.linalg.norm((xpath_bd - [1, 1]), axis=1)

    # purely aesthetics of the plot
    if np.linalg.norm(xg) < 10:
        scale = "linear"
    else:
        scale = "symlog"

    # plot
    if display is True:
        fig1 = plt.figure()
        fig2 = plt.figure()
        nw = hw2.newton(xg)
        xpath_nw = hw2.xpath
        norm_distance_nw = np.linalg.norm((xpath_nw - [1, 1]), axis=1)

        ax = fig1.add_subplot(111)
        ax.plot(xpath_bd[:, 0], xpath_bd[:, 1], 'r', label='Bracket descent', linewidth=1)
        ax.scatter(xpath_bd[:, 0], xpath_bd[:, 1], s=10, c='black')
        ax.scatter(xg[0], xg[1], c='green', label='Initial guess')
        ax.plot(xpath_nw[:, 0], xpath_nw[:, 1], 'b', label='Newton')
        # ax.plot(xf[0], xf[1], 'c*', label='Final convergence of bracket_descent')
        ax.scatter(nw[0][0], nw[0][1], c='orange', marker='D', label='Actual minimum')
        ax.set_yscale(scale)
        ax.set_ylabel('y-coordinate')
        ax.set_xlabel('x-coordinate')
        ax.set_title('The paths of bracket descent and Newton \n starting from xguess={}. \n '
                     'Cmd. bracket_descent_test({},True), Igor Adamski'.format(xg, xg))
        ax.legend()

        ax2 = fig2.add_subplot(111)
        ax2.plot(norm_distance_nw, label='Newton')
        ax2.plot(norm_distance_bd, label='Bracket descent')
        ax2.set_xlabel('Iterations')
        ax2.set_ylabel('Norm distance from [1,1]')
        ax2.set_yscale(scale)
        ax2.set_title('Norm distance from minimum at [1,1] against \n the number of iterations. '
                      '\n Cmd. bracket_descent_test({},True). Igor Adamski'.format(xg))
        ax2.legend()
        plt.show()

    return xf, jf
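# --------------------------------------------------------------------------
# Geometric check (not part of the original assignment code) of the reasoning
# above that bracket descent's step length is tied to the current triangle
# size. Assuming a move that reflects one vertex through the midpoint of the
# opposite side (a common simplex-style move; the exact reflection/expansion
# rule used in hw2 may differ), the centroid is displaced by exactly the
# centroid-to-vertex distance L, so steps shrink as the triangle shrinks.
def _triangle_step_sketch(side=1.0):
    """Reflect one vertex of an equilateral triangle and measure the centroid displacement."""
    v = np.array([[0.0, 0.0],
                  [side, 0.0],
                  [side / 2.0, side * np.sqrt(3.0) / 2.0]])
    g = v.mean(axis=0)                       # centroid
    L = np.linalg.norm(v[0] - g)             # centroid-to-vertex distance
    m = 0.5 * (v[1] + v[2])                  # midpoint of the side opposite v[0]
    v_new = 2.0 * m - v[0]                   # reflect v[0] through that midpoint
    g_new = (v_new + v[1] + v[2]) / 3.0
    print('L =', L, ' centroid step =', np.linalg.norm(g_new - g))  # the two values coincide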
def performance():
    """
    Assess performance of B-D and L-BFGS-B methods. Add input/output as needed.

    Discussion and assessment:

    hw241.png shows the error of the final result for both bracket descent and
    L-BFGS-B for a varied selection of points. For bracket descent we see a range of
    errors from 0.0030 to 0.00025; for L-BFGS-B we see much lower errors (discussed
    below).

    hw242.png shows the number of iterations required before convergence for both
    methods for a variety of points. We once again see that bracket descent requires
    a higher number of iterations for every point evaluated (roughly double the
    number of iterations that L-BFGS-B requires). This is because "L-BFGS uses an
    estimation to the inverse Hessian matrix to steer its search through variable
    space" (Scipy user manual), in a similar way to the Newton method used earlier.
    This allows it to take larger steps in the direction of the minimum.

    Interestingly, hw243.png shows that for all the points L-BFGS-B had a larger
    clock time, approximately 0.0012 to 0.0035 s (compared to a much lower time for
    bracket descent). This is due to the estimate of the inverse Hessian that must be
    computed at every step of the L-BFGS-B algorithm, whilst bracket descent relies
    on a series of simple if statements. However, it is worth noting that bracket
    descent has a default module tolerance of 1e-6, while L-BFGS-B by default has a
    tolerance of 2.2e-9. This explains part of the slower behaviour of L-BFGS-B, and
    also its much higher precision, along with the reasons above.

    hw244/5.png shows the effect that the noise amplitude has on both algorithms. We
    see that for a starting value of [-1,-10], the amplitude seems to have a much
    larger effect on bracket descent. This will be because L-BFGS-B calls costj once
    per loop, whilst bracket descent calls costj 3 times, once for each vertex of the
    triangle.

    Describe 3 distinct features that could be (or have been) added to your Python
    code to improve its suitability for use as scientific software:
    1) An easy fix would be to give the user access to other cost functions (mean
       square error etc.).
    2) More optimization algorithms in the Fortran module would give the Python user
       a greater degree of freedom. Also, creating a GUI would make the program more
       accessible to scientists.
    3) Parallelisation: using the OpenMP techniques described in the lectures we
       could parallelize some of the Fortran code.
    4) (extra) Make the tool an online resource. This way users do not need to
       download f2py, gfortran, the exact same compiler, Python etc. They can just
       run it online (to create this, make a CGI).

    Have a nice day Dr. Ray.
    """
    x1_vals = [-100, -50, -10, -1]
    x2 = -3
    BFGS_no_iterations = []     # Number of iterations before convergence for L-BFGS-B
    BFGS_errors = []            # Error for L-BFGS-B
    Bracket_errors = []         # Error for bracket descent
    Bracket_no_iterations = []  # Number of iterations before convergence for bracket descent
    Bracket_clocktimes = []
    BFGS_clocktimes = []

    # Generate data:
    for i in range(len(x1_vals)):
        # set the point that we will evaluate
        xg = (x1_vals[i], x2)

        # perform both optimization methods
        t1 = time.time()            # start timer 1
        tp1 = time.process_time()   # start timer 2
        # RUN THE ALGORITHM
        data_bracket = hw2.bracket_descent(xg)
        t2 = time.time()            # t2-t1 gives wallclock time
        tp2 = time.process_time()   # tp2-tp1 gives cpu time -- depends on number of cores!
        Bracket_clocktimes.append(t2 - t1)

        t1 = time.time()            # start timer 1
        tp1 = time.process_time()   # start timer 2
        # RUN THE ALGORITHM
        data_LBFGSB = minimize(cost.costj, xg, method='L-BFGS-B')
        t2 = time.time()            # t2-t1 gives wallclock time
        tp2 = time.process_time()   # tp2-tp1 gives cpu time -- depends on number of cores!
        BFGS_clocktimes.append(t2 - t1)

        # Add the errors to the lists:
        temp_bracket = [1, 1] - data_bracket[0]
        Bracket_errors.append(np.sqrt(temp_bracket[0]**2 + temp_bracket[1]**2))
        temp_bfgs = [1, 1] - data_LBFGSB.x
        BFGS_errors.append(np.sqrt(temp_bfgs[0]**2 + temp_bfgs[1]**2))

        # Add the number of iterations required till convergence for both:
        Bracket_no_iterations.append(len(hw2.jpath))
        BFGS_no_iterations.append(data_LBFGSB.nit)

    # Plot (using pyplot plt.bar online template)
    # BAR PLOT FOR BRACKET ERRORS
    fig, ax = plt.subplots()
    plt.suptitle('Lawrence Stewart - Created Using performance().')
    index = np.arange(4)
    bar_width = 0.35
    opacity = 0.7
    rects1 = plt.bar(index, tuple(Bracket_errors), bar_width,
                     alpha=opacity, color='c', label='Bracket Descent')
    rects2 = plt.bar(index + bar_width, tuple(BFGS_errors), bar_width,
                     alpha=opacity, color='m', label='L-BFGS-B')
    plt.xticks(index + bar_width, ('(-100, -3)', '(-50,-3)', '(-10, -3)', '(-1,-3)'))
    plt.legend()
    ax = plt.gca()
    ax.set_facecolor('#D9E6E8')
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.title("Errors Plots for varied starting points ")
    plt.ylabel("Error")
    plt.show()

    # BAR PLOT FOR NUMBER OF ITERATIONS
    fig, ax = plt.subplots()
    plt.suptitle('Lawrence Stewart - Created Using performance().')
    index = np.arange(4)
    bar_width = 0.35
    opacity = 0.7
    rects1 = plt.bar(index, tuple(Bracket_no_iterations), bar_width,
                     alpha=opacity, color='c', label='Bracket Descent')
    rects2 = plt.bar(index + bar_width, tuple(BFGS_no_iterations), bar_width,
                     alpha=opacity, color='m', label='L-BFGS-B')
    plt.xticks(index + bar_width, ('(-100, -3)', '(-50,-3)', '(-10, -3)', '(-1,-3)'))
    plt.legend()
    ax = plt.gca()
    ax.set_facecolor('#D9E6E8')
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.title("Number of iterations before convergence for varied points ")
    plt.ylabel("Number of iterations")
    plt.show()

    # BAR PLOT FOR CLOCKTIMES
    fig, ax = plt.subplots()
    plt.suptitle('Lawrence Stewart - Created Using performance().')
    index = np.arange(4)
    bar_width = 0.35
    opacity = 0.7
    rects1 = plt.bar(index, tuple(Bracket_clocktimes), bar_width,
                     alpha=opacity, color='c', label='Bracket Descent')
    rects2 = plt.bar(index + bar_width, tuple(BFGS_clocktimes), bar_width,
                     alpha=opacity, color='m', label='L-BFGS-B')
    plt.xticks(index + bar_width, ('(-100, -3)', '(-50,-3)', '(-10, -3)', '(-1,-3)'))
    plt.legend()
    ax = plt.gca()
    ax.set_facecolor('#D9E6E8')
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.title("Clocktimes for a selection of starting points")
    plt.ylabel("Time (s)")
    plt.show()

    # ---------------ANALYSE NOISE---------------
    # Analyse effects of noise at point xg:
    xg = [-1, -10]

    # Generate the different noise amplitudes, 0 to 50
    noise_vals = np.arange(50 + 1)

    # Allocate the lists for storing the number of iterations and wallclock times
    noise_Bracket_iterations = []
    noise_BFGS_iterations = []
    bd_wallclocks = []
    LBFGSB_wallclocks = []

    # Run the algorithms for each specified noise value
    for i in range(len(noise_vals)):
        # Set noise
        cost.c_noise = True
        cost.c_noise_amp = noise_vals[i]

        # Time for bracket descent:
        t1 = time.time()            # start timer 1
        tp1 = time.process_time()   # start timer 2
        # RUN THE ALGORITHM
        data_bracket = hw2.bracket_descent(xg)
        t2 = time.time()            # t2-t1 gives wallclock time
        tp2 = time.process_time()   # tp2-tp1 gives cpu time -- depends on number of cores!
        bd_wallclock = t2 - t1
        bd_cpu_time = tp2 - tp1
        bd_wallclocks.append(bd_wallclock)

        # Time for L-BFGS-B:
        t1 = time.time()            # start timer 1
        tp1 = time.process_time()   # start timer 2
        # RUN THE ALGORITHM
        data_LBFGSB = minimize(cost.costj, xg, method='L-BFGS-B')
        t2 = time.time()            # t2-t1 gives wallclock time
        tp2 = time.process_time()   # tp2-tp1 gives cpu time -- depends on number of cores!
        LBFGSB_wallclock = t2 - t1
        LBFGSB_cpu_time = tp2 - tp1
        LBFGSB_wallclocks.append(LBFGSB_wallclock)

        # Add the number of iterations required to converge to the lists
        noise_Bracket_iterations.append(len(hw2.jpath))
        noise_BFGS_iterations.append(data_LBFGSB.nit)

    # Reset the noise back to default
    cost.c_noise_amp = 0.0

    plt.figure(figsize=(14, 6))
    plt.suptitle('Lawrence Stewart - Created Using performance().')
    plt.subplot(121)
    plt.plot(noise_vals, LBFGSB_wallclocks, label="L-BFGS-B", alpha=0.7, color='r')
    plt.plot(noise_vals, bd_wallclocks, label="Bracket Descent", alpha=0.7, color='m')
    ax = plt.gca()
    ax.set_facecolor('#D9E6E8')
    plt.xlabel("Noise Amplitude")
    plt.title("Wallclock times at varied noise amplitudes with xg= %s" % (xg))
    plt.ylabel("Time (s)")
    plt.grid("on")
    plt.legend()

    plt.subplot(122)
    # labels corrected: the bracket-descent data is now labelled as such (they were swapped)
    plt.plot(noise_vals, noise_Bracket_iterations, label='Bracket Descent', alpha=0.7, color='r')
    plt.plot(noise_vals, noise_BFGS_iterations, label='L-BFGS-B', alpha=0.7, color='m')
    plt.title("Number of Iterations As Amplitude Changes, xg=%s" % xg)
    plt.ylabel("Number of Iterations")
    plt.xlabel("Amplitude")
    ax = plt.gca()
    ax.set_facecolor('#D9E6E8')
    plt.grid("on")
    plt.legend()
    plt.show()

    return None
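# --------------------------------------------------------------------------
# Possible refinement (not part of the original code): the wallclock numbers
# in the noise loop above come from a single run per amplitude, so they are
# themselves noisy. The sketch below uses timeit to average several runs; the
# helper name, repeat and number counts are illustrative assumptions.
import timeit

def _timed_average_sketch(func, repeat=5, number=20):
    """Return a low-noise per-call runtime estimate (in seconds) for func."""
    timer = timeit.Timer(func)
    # repeat() returns one total time per batch of `number` calls; dividing by
    # `number` and taking the minimum gives a less noisy per-call estimate.
    return min(timer.repeat(repeat=repeat, number=number)) / number

# Example (hypothetical usage): _timed_average_sketch(lambda: hw2.bracket_descent([-1, -10]))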
def bracket_descent_test(xg, display=False):
    """
    Use bracket-descent to minimize cost function defined in cost module.
    Input variable xg is initial guess for location of minimum.
    When display is true, 1-2 figures comparing the B-D and Newton steps should be generated.
    Output variables: xf -- computed location of minimum, jf -- computed minimum.

    Discussion and Explanation:
    Bracket descent and Newton's method operate in very different fashions, as seen in
    the figures attached. hw231.png shows the step size, h, taken at each iteration of
    both algorithms. Distance is defined with the usual Euclidean norm, and one can
    see that Newton's method initially takes very large steps, approximately 10000
    (for the starting point [100,10]). The step size decreases drastically as the
    algorithm converges upon the minimum. The reason for such a large initial step is
    the gradient-based framework Newton's method uses, which allows it to move towards
    the minimum in large steps. Bracket descent's step size remains approximately
    constant, which is to be expected given the triangular descent scheme the
    algorithm uses (the algorithm is bounded in the step size it can take).

    hw232.png shows the wallclock and CPU time for both methods. Due to its faster
    convergence, Newton's method terminates after a shorter duration for both CPU and
    wallclock time: approximately 0.00001 s wallclock and roughly the same for CPU.
    Bracket descent takes longer to converge, approximately 0.00005 s for both
    wallclock and CPU time. If we took points further from the global minimum at
    [1,1], this result would be amplified, because of the roughly constant step size
    of bracket descent: the size of Newton's initial moves would grow with a more
    distant starting point, and its time would remain small.
    """
    if display == False:
        xf, jf = hw2.bracket_descent(xg)

    if display == True:
        average_bd_wallclock = 0
        average_newton_wallclock = 0
        average_newton_cpu_time = 0
        average_bd_cpu_time = 0

        # Time bracket descent over an average of 20 runs:
        for i in range(20):
            t1 = time.time()            # start timer 1
            tp1 = time.process_time()   # start timer 2
            # Run bracket descent
            xf, jf = hw2.bracket_descent(xg)
            t2 = time.time()            # t2-t1 gives wallclock time
            tp2 = time.process_time()   # tp2-tp1 gives cpu time -- depends on number of cores!
            bd_wallclock = t2 - t1
            bd_cpu_time = tp2 - tp1
            average_bd_wallclock += bd_wallclock
            average_bd_cpu_time += bd_cpu_time

        # Step sizes along the bracket-descent path, and the Newton step at each point
        xpath = hw2.xpath
        bracket_steps = [xpath[i + 1] - xpath[i] for i in range(len(xpath) - 1)]
        newton_steps = [hw2.newtonstep(xpath[i]) for i in range(len(xpath) - 1)]
        bracket_steps_dist = [np.sqrt(bracket_steps[i][0]**2 + bracket_steps[i][1]**2)
                              for i in range(len(bracket_steps))]
        newton_steps_dist = [np.sqrt(newton_steps[i][0]**2 + newton_steps[i][1]**2)
                             for i in range(len(newton_steps))]
        steps = np.arange(1, len(bracket_steps) + 1, 1)
        ratio_steps = []
        for i in range(len(bracket_steps_dist)):
            ratio_steps.append(newton_steps_dist[i] / bracket_steps_dist[i])

        # Run Newton for timing as well
        for i in range(20):
            t1 = time.time()            # start timer 1
            tp1 = time.process_time()   # start timer 2
            # Newton's method
            hw2.newton(xg)
            t2 = time.time()            # t2-t1 gives wallclock time
            tp2 = time.process_time()   # tp2-tp1 gives cpu time -- depends on number of cores!
            newton_wallclock = t2 - t1
            newton_cpu_time = tp2 - tp1
            average_newton_wallclock += newton_wallclock
            average_newton_cpu_time += newton_cpu_time

        # divide by 20 to create the averages
        average_newton_cpu_time = average_newton_cpu_time / 20
        average_newton_wallclock = average_newton_wallclock / 20
        average_bd_wallclock = average_bd_wallclock / 20
        average_bd_cpu_time = average_bd_cpu_time / 20

        plt.figure()
        plt.title("Step (h) Comparison for Newtons and Bracket Descent with xg=%s" % xg)
        plt.suptitle('Lawrence Stewart - Created Using bracket_descent_test().')
        plt.xlabel('Iteration number')
        plt.ylabel('Size of step h')
        plt.plot(steps, bracket_steps_dist, label="Bracket Descent", alpha=0.8, color='r')
        plt.plot(steps, newton_steps_dist, label="Newtons Method", alpha=0.7)
        ax = plt.gca()
        plt.grid('on')
        ax.set_facecolor('#D9E6E8')
        plt.legend()

        # Plot timings:
        fig, ax = plt.subplots()
        plt.suptitle('Lawrence Stewart - Created Using bracket_descent_test().')
        index = np.arange(2)
        bar_width = 0.35
        opacity = 0.7
        rects1 = plt.bar(index, (average_newton_wallclock, average_newton_cpu_time),
                         bar_width, alpha=opacity, color='c', label='Newton')
        rects2 = plt.bar(index + bar_width, (average_bd_wallclock, average_bd_cpu_time),
                         bar_width, alpha=opacity, color='m', label='BD')
        plt.xticks(index + bar_width, ('Wallclock Time', 'CPU Time'))
        plt.legend()
        ax = plt.gca()
        ax.set_facecolor('#D9E6E8')
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.title("Average Timings for Bracket Descent and Newtons for xg=%s" % xg)
        plt.ylabel("Time (s)")
        plt.show()

    return xf, jf
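# --------------------------------------------------------------------------
# Side note (not part of the original code): the per-iteration step sizes h
# discussed above can also be computed in a vectorized way from any (n, 2)
# path array, avoiding the explicit list comprehensions. This is just an
# equivalent, assumed-idiomatic alternative.
def _step_sizes_sketch(xpath):
    """Return the Euclidean length of each consecutive step along an (n, 2) path."""
    xpath = np.asarray(xpath)
    return np.linalg.norm(np.diff(xpath, axis=0), axis=1)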