return distance_matrix def arc_distance_numpy_broadcast(a, b): """ Calculates the pairwise arc distance between all points in vector a and b. """ theta_1 = a[:, 0][:, None] theta_2 = b[:, 0][None, :] phi_1 = a[:, 1][:, None] phi_2 = b[:, 1][None, :] temp = (np.sin((theta_2 - theta_1) / 2)**2 + np.cos(theta_1) * np.cos(theta_2) * np.sin((phi_2 - phi_1) / 2)**2) distance_matrix = 2 * (np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))) return distance_matrix from compare_perf import compare_perf n = 1000 import numpy as np a = np.random.rand(n, 2) b = np.random.rand(n, 2) compare_perf(arc_distance_python_nested_for_loops, [a, b]) compare_perf(arc_distance_numpy_broadcast, [a, b]) compare_perf(arc_distance_numpy_tile, [a, b])
def harris(I):
    """Return a Harris-style corner score for the points of a 2-D array.

    I is unpacked as a 2-D array of shape (m, n).  The returned array has
    shape (m - 1, n - 1): one score for every point at which both a row
    difference and a column difference are available.
    """
    m, n = I.shape
    # Differences between neighbouring rows and neighbouring columns.
    # Each one is trimmed by a column / a row so that both end up with
    # the same (m - 1, n - 1) shape.
    row_diff = I[1:, :] - I[:m - 1, :]
    col_diff = I[:, 1:] - I[:, :n - 1]
    dx = row_diff[:, 1:]
    dy = col_diff[1:, :]
    # Entries of the per-point matrix of derivative products
    #     M = | xx  xy |
    #         | xy  yy |
    xx = dx * dx
    yy = dy * dy
    xy = dx * dy
    # The score at each point is det(M) - k * trace(M)^2.
    trace = xx + yy
    determinant = xx * yy - xy * xy
    k = 0.05
    return determinant - k * trace * trace


from compare_perf import compare_perf

# Benchmark input: normally distributed samples cast to uint8.
m, n = 1920, 1080
dtype = 'uint8'
I = np.random.randn(m, n).astype(dtype)
compare_perf(harris, [I], propagate_exceptions=True, backends=('openmp',))
""" Accepted response on stack overflow by phillip """ gg = np.outer(g, g) gggg = np.outer(gg, gg).reshape(4 * g.shape) axes = ((0, 2, 4, 6), (0, 1, 2, 3)) return np.tensordot(gggg, T, axes) T = np.random.randn(n, n, n, n) g = np.random.randn(n, n) from compare_perf import compare_perf compare_perf(rotT_loops, [T, g], extra={'numpy_tensordot': rotT_numpy}, numba=False, backends=('c', 'openmp'), cpython=True) def rotT_par(T, g): def compute_elt(i, j, k, l): total = 0.0 for ii in range(n): for jj in range(n): for kk in range(n): for ll in range(n): gg = g[ii, i] * g[jj, j] * g[kk, k] * g[ll, l] total += gg * T[ii, jj, kk, ll] return total
def rosen_der_np(x):
    """Gradient of the Rosenbrock function, written with array slices.

    Returns an array shaped like x.  Interior entries are filled from
    shifted views of x; the first and last entries are set separately.
    """
    der = np.empty_like(x)
    # Three aligned views: each interior point with its two neighbours.
    before = x[:-2]
    here = x[1:-1]
    after = x[2:]
    der[1:-1] = (+200 * (here - before**2)
                 - 400 * (after - here**2) * here
                 - 2 * (1 - here))
    # End points only have one neighbour each.
    der[0] = -400 * x[0] * (x[1] - x[0]**2) - 2 * (1 - x[0])
    der[-1] = 200 * (x[-1] - x[-2]**2)
    return der


def rosen_der_loops(x):
    """Gradient of the Rosenbrock function, written as an explicit loop.

    Produces the same values as rosen_der_np, one interior element at a
    time.
    """
    n = x.shape[0]
    der = np.empty_like(x)
    for i in range(1, n - 1):
        before = x[i - 1]
        here = x[i]
        after = x[i + 1]
        der[i] = (+200 * (here - before**2)
                  - 400 * (after - here**2) * here
                  - 2 * (1 - here))
    der[0] = -400 * x[0] * (x[1] - x[0]**2) - 2 * (1 - x[0])
    der[-1] = 200 * (x[-1] - x[-2]**2)
    return der


if __name__ == '__main__':
    N = 10**7
    x = np.arange(N) / float(N)
    # Run the jitted version once before the timed comparison.
    jit(rosen_der_np)(x)
    from compare_perf import compare_perf
    # numba still crashes on negative indexing
    compare_perf(rosen_der_np, [x.copy()], numba=False)
    compare_perf(rosen_der_loops, [x.copy()], numba=False)
winning_colony = state[i, j, 0] defense_strength = state[i, j, 1] for jj in xrange(max(j - window_radius, 0), min(j + window_radius + 1, width)): for ii in xrange(max(i - window_radius, 0), min(i + window_radius + 1, height)): if ii != i or jj != j: d = image[i, j, :] - image[ii, jj, :] s = np.sum(d**2) gval = 1.0 - np.sqrt(s) / np.sqrt(3) attack_strength = gval * state[ii, jj, 1] if attack_strength > defense_strength: defense_strength = attack_strength winning_colony = state[ii, jj, 0] return [winning_colony, defense_strength] return parakeet.imap(attack, (height, width)) from compare_perf import compare_perf import time t = time.time() growcut_python(image, state, state_next, window_radius) t2 = time.time() print "Python time", t2 - t compare_perf(growcut_par, [image, state, window_radius], suppress_output=False, propagate_exceptions=True)
# Edge length of every axis of the tensor and of the square matrix g.
n = 7


def rotT_loops(T, g):
    """Transform the rank-4 tensor T by the matrix g with explicit loops.

    For every output index (a, b, c, d) this accumulates
    g[p, a] * g[q, b] * g[r, c] * g[s, d] * T[p, q, r, s] over all
    (p, q, r, s).  The loop extents and the output shape come from the
    module-level n.
    """
    Tprime = np.zeros((n, n, n, n))
    for a in range(n):
        for b in range(n):
            for c in range(n):
                for d in range(n):
                    for p in range(n):
                        for q in range(n):
                            for r in range(n):
                                for s in range(n):
                                    coeff = g[p, a] * g[q, b] * g[r, c] * g[s, d]
                                    Tprime[a, b, c, d] += coeff * T[p, q, r, s]
    return Tprime


def rotT_numpy(T, g):
    """
    Same transformation expressed with outer products and tensordot.

    Accepted response on stack overflow by phillip
    """
    pair = np.outer(g, g)
    # 4 * g.shape repeats the shape tuple, giving an 8-axis array.
    quad = np.outer(pair, pair).reshape(4 * g.shape)
    contracted_axes = ((0, 2, 4, 6), (0, 1, 2, 3))
    return np.tensordot(quad, T, contracted_axes)


T = np.random.randn(n, n, n, n)
g = np.random.randn(n, n)

from compare_perf import compare_perf

compare_perf(rotT_loops, [T, g], extra={'numpy_tensordot': rotT_numpy})
import numpy as np


def smooth(x, alpha):
    """Return an exponentially smoothed copy of the 1-D sequence x.

    The first element is kept as is; every later element is
    alpha * x[t] plus (1 - alpha) times the previous smoothed value.
    """
    smoothed = x.copy()
    carry = 1 - alpha
    for t in xrange(1, len(x)):
        smoothed[t] = alpha * x[t] + carry * smoothed[t - 1]
    return smoothed


n = 10**6
alpha = 0.01
X = np.random.randn(n).astype('float32')

from compare_perf import compare_perf

compare_perf(smooth, [X, alpha])
def rosen_der_np(x):
    """Gradient of the Rosenbrock function, written with array slices.

    Returns an array shaped like x.  Interior entries are filled from
    shifted views of x; the first and last entries are set separately.
    """
    der = np.empty_like(x)
    # Three aligned views: each interior point with its two neighbours.
    before = x[:-2]
    here = x[1:-1]
    after = x[2:]
    der[1:-1] = (+200 * (here - before ** 2)
                 - 400 * (after - here ** 2) * here
                 - 2 * (1 - here))
    # End points only have one neighbour each.
    der[0] = -400 * x[0] * (x[1] - x[0] ** 2) - 2 * (1 - x[0])
    der[-1] = 200 * (x[-1] - x[-2] ** 2)
    return der


def rosen_der_loops(x):
    """Gradient of the Rosenbrock function, written as an explicit loop.

    Produces the same values as rosen_der_np, one interior element at a
    time.
    """
    n = x.shape[0]
    der = np.empty_like(x)
    for i in range(1, n - 1):
        before = x[i - 1]
        here = x[i]
        after = x[i + 1]
        der[i] = (+200 * (here - before ** 2)
                  - 400 * (after - here ** 2) * here
                  - 2 * (1 - here))
    der[0] = -400 * x[0] * (x[1] - x[0] ** 2) - 2 * (1 - x[0])
    der[-1] = 200 * (x[-1] - x[-2] ** 2)
    return der


if __name__ == '__main__':
    N = 10**5
    x = np.arange(N) / float(N)
    # Run the jitted version once before the timed comparison.
    jit(rosen_der_np)(x)
    from compare_perf import compare_perf
    # numba still crashes on negative indexing
    compare_perf(rosen_der_np, [x.copy()], numba=False)
    compare_perf(rosen_der_loops, [x.copy()], numba=False)
def local_maxima(data, wsize, mode=wrap):
    """Flag every element of data that no element in its window exceeds.

    data  -- N-dimensional array
    wsize -- window extent along each axis
    mode  -- called as mode(p, o - w / 2, w) per axis; its return value is
             used as the index along that axis (the default, wrap, is
             defined elsewhere in this file)

    Returns a boolean array with the shape of data.
    """
    result = np.ones(shape=data.shape, dtype=bool)
    for pos in np.ndindex(data.shape):
        center = data[pos]
        for offset in np.ndindex(wsize):
            per_axis = zip(pos, offset, wsize)
            neighbor_idx = tuple(mode(p, o - w / 2, w)
                                 for (p, o, w) in per_axis)
            not_greater = data[neighbor_idx] <= center
            result[pos] &= not_greater
    return result


@parakeet.jit
def parakeet_local_maxima(data, wsize, mode=wrap):
    """Same test as local_maxima, expressed with nested parakeet.imap calls.

    The outer imap runs over data.shape (one call per position), the
    inner one over wsize (one call per window offset).
    """
    def is_max(pos):
        def is_smaller_neighbor(offset):
            per_axis = zip(pos, offset, wsize)
            neighbor_idx = tuple(mode(p, o - w / 2, w)
                                 for (p, o, w) in per_axis)
            return data[neighbor_idx] <= data[pos]
        window_flags = parakeet.imap(is_smaller_neighbor, wsize)
        return np.all(window_flags)
    return parakeet.imap(is_max, data.shape)


# not sure how to get numba to auto-jit size generic code
# get error: "FAILED with KeyError 'sized_pointer(npy_intp, 4)'"
#import numba
#numba_local_maxima = numba.autojit(python_local_maxima)

from compare_perf import compare_perf

shape = (30, 30, 20, 12)
x = np.random.randn(*shape)
compare_perf(local_maxima, [x, shape])
import numpy as np


def dot(x, y):
    """Sum of the element-wise products of x and y."""
    products = x * y
    return sum(products)


def matmult_high_level(X, Y):
    """Matrix product built from one np.dot per (row of X, column of Y)."""
    columns = Y.T
    return np.array([[np.dot(row, col) for col in columns] for row in X])


def matmult_loops(X, Y, Z):
    """Write the matrix product of X and Y into Z and return Z.

    Each accumulator starts from the k == 0 term, so the inner loop
    begins at 1.
    """
    n_rows, inner = X.shape
    n_cols = Y.shape[1]
    for r in xrange(n_rows):
        for c in xrange(n_cols):
            acc = X[r, 0] * Y[0, c]
            for k in xrange(1, inner):
                acc += X[r, k] * Y[k, c]
            Z[r, c] = acc
    return Z


n, d = 2000, 500
m = 2000
X = np.random.randn(m, d).astype('float64')
Y = np.random.randn(d, n).astype('float64')
Z = np.zeros((m, n)).astype('float64')

from compare_perf import compare_perf

compare_perf(matmult_high_level, [X, Y],
             cpython=True,
             numba=False,
             extra={'numpy': np.dot},
             suppress_output=False)
compare_perf(matmult_loops, [X, Y, Z], cpython=False)
grid_x = np.linspace(-bound, bound, N) for i, x in enumerate(grid_x): for j, y in enumerate(grid_x): julia[i, j] = kernel(x, y, cr, ci, lim, cutoff=cutoff) return julia def julia(cr, ci, N, bound=1.5, lim=1000., cutoff=1e6): grid_x = np.linspace(-bound, bound, N) return np.array( [[kernel(x, y, cr, ci, lim, cutoff=cutoff) for x in grid_x] for y in grid_x]) from compare_perf import compare_perf cr = 0.285 ci = 0.01 N = 1200 bound = 1.5 lim = 1000 cutoff = 1e6 extra = {} try: from numba import autojit extra['numba'] = autojit(julia_loops) except: print "Failed to import Numba" compare_perf(julia, [cr, ci, N, bound, lim, cutoff], numba=False, extra=extra)
import numpy as np


def smooth(x, alpha):
    """Return an exponentially smoothed copy of the 1-D sequence x.

    The first element is kept as is; every later element is
    alpha * x[t] plus (1 - alpha) times the previous smoothed value.
    """
    smoothed = x.copy()
    carry = 1 - alpha
    for t in xrange(1, len(x)):
        smoothed[t] = alpha * x[t] + carry * smoothed[t - 1]
    return smoothed


n = 10**6
alpha = 0.01
X = np.random.randn(n).astype('float32')

from compare_perf import compare_perf

compare_perf(smooth, [X, alpha])
curr_dist += (x[xidx] - centroid[xidx])**2 if curr_dist < min_dist: min_dist = curr_dist min_idx = cidx A[i] = min_idx # recompute the clusters by averaging data points # assigned to them for cidx in xrange(k): # reset centroids for dim_idx in xrange(ndims): C[cidx, dim_idx] = 0 # add each data point only to its assigned centroid cluster_count = 0 for i in xrange(n): if A[i] == cidx: C[cidx, :] += X[i, :] cluster_count += 1 C[cidx, :] /= cluster_count return C n, d = 10**4, 50 X = np.random.randn(n, d) k = 25 from compare_perf import compare_perf compare_perf(kmeans_comprehensions, [X, k, 5], cpython=False) compare_perf(kmeans_loops, [X, k, 5], cpython=True)
#
# Longest hailstone sequence from http://www.mit.edu/~mtikekar/posts/stream-fusion.html
#
import sys


def collatzLen(a0):
    """Count the steps needed to bring a0 down to 1.

    An even value is halved; an odd value goes to (3 * a + 1) / 2, so the
    tripling and the halving that follows it count as a single step.
    """
    steps = 0
    value = a0
    while value != 1:
        if value % 2 == 0:
            value = value / 2
        else:
            value = (3 * value + 1) / 2
        steps += 1
    return steps


def maxLen(max_a0):
    """Find the longest sequence among the starts 1..max_a0 (inclusive).

    Returns (length, start); the earliest start wins a tie because only a
    strictly longer sequence replaces the current best.
    """
    best_length = 0
    best_start = 0
    for start in xrange(1, max_a0 + 1):
        current = collatzLen(start)
        if current > best_length:
            best_length = current
            best_start = start
    return best_length, best_start


from compare_perf import compare_perf

compare_perf(maxLen, [1000000])
# h = max_d_curr*.5 #h = max(h,0.55*dx) # particle pixel center xpos = physical_to_pixel(x, xmin, dx) ypos = physical_to_pixel(y, ymin, dy) left = xpos - k / 2 upper = ypos - k / 2 for i in xrange(0, k): for j in xrange(0, k): if ((i + left >= 0) and (i + left < nx) and (j + upper >= 0) and (j + upper < ny)): image[(i + left), (j + upper)] += kernel[i, j] * qt start_ind = end_ind return image N = 20 x = y = z = hs = qts = mass = rhos = np.random.rand(N) nx = ny = 100 args = (x, y, qts, hs, nx, ny, 0.0, 1.0, 0.0, 1.0, 1) template_kernel_cpu(*args) from compare_perf import compare_perf compare_perf(template_kernel_cpu, args)
#
# Longest hailstone sequence from http://www.mit.edu/~mtikekar/posts/stream-fusion.html
#
import sys


def collatzLen(a0):
    """Count the steps needed to bring a0 down to 1.

    An even value is halved; an odd value goes to (3 * a + 1) / 2, so the
    tripling and the halving that follows it count as a single step.
    """
    steps = 0
    value = a0
    while value != 1:
        if value % 2 == 0:
            value = value / 2
        else:
            value = (3 * value + 1) / 2
        steps += 1
    return steps


def maxLen(max_a0):
    """Find the longest sequence among the starts 1..max_a0 (inclusive).

    Returns (length, start); the earliest start wins a tie because only a
    strictly longer sequence replaces the current best.
    """
    best_length = 0
    best_start = 0
    for start in xrange(1, max_a0 + 1):
        current = collatzLen(start)
        if current > best_length:
            best_length = current
            best_start = start
    return best_length, best_start


from compare_perf import compare_perf

compare_perf(maxLen, [1000000])
for ii in xrange(max(i-window_radius, 0), min(i+window_radius+1, height)): if ii != i or jj != j: d = image[i, j, :] - image[ii, jj, :] s = np.sum(d**2) gval = 1.0 - np.sqrt(s) / np.sqrt(3) attack_strength = gval * state[ii, jj, 1] if attack_strength > defense_strength: defense_strength = attack_strength winning_colony = state[ii, jj, 0] changes += 1 state_next[i, j, 0] = winning_colony state_next[i, j, 1] = defense_strength return changes N = 50 dtype = np.double image = np.zeros((N, N, 3), dtype=dtype) state = np.zeros((N, N, 2), dtype=dtype) state_next = np.empty_like(state) # colony 1 is strength 1 at position 0,0 # colony 0 is strength 0 at all other positions state[0, 0, 0] = 1 state[0, 0, 1] = 1 window_radius = 10 from compare_perf import compare_perf compare_perf(growcut_python, [image, state, state_next, window_radius])
if(x_pix_start < 0): x_pix_start = 0 if(x_pix_stop > nx): x_pix_stop = int32(nx-1) if(y_pix_start < 0): y_pix_start = 0 if(y_pix_stop > ny): y_pix_stop = int32(ny-1) for xpix in range(x_pix_start, x_pix_stop) : for ypix in range(y_pix_start, y_pix_stop) : # physical coordinates of pixel xpixel = pixel_to_physical(xpix,x_start,dx) ypixel = pixel_to_physical(ypix,y_start,dy) zpixel = zplane dxpix, dypix, dzpix = [x-xpixel,y-ypixel,z-zpixel] d = distance(dxpix,dypix,dzpix) if (d/h < 2) : kernel_val = kernel_vals[int(d/(.01*h))]/(h*h*h) image[xpix,ypix] += qt*kernel_val return image from compare_perf import compare_perf N = 160 x = y = z = hs= qts = mass = rhos = np.random.rand(N) nx=ny=80 args = (x,y,z,hs,qts,mass,rhos,nx,ny, 0.0, 1.0, 0.0, 1.0) compare_perf(render_image, args, numba = True, backends= ('c',))
julia = np.empty((N, N), dtype=np.uint32) grid_x = np.linspace(-bound, bound, N) for i, x in enumerate(grid_x): for j, y in enumerate(grid_x): julia[i,j] = kernel(x, y, cr, ci, lim, cutoff=cutoff) return julia def julia(cr, ci, N, bound=1.5, lim=1000., cutoff=1e6): grid_x = np.linspace(-bound, bound, N) return np.array([[kernel(x,y,cr,ci,lim,cutoff=cutoff) for x in grid_x] for y in grid_x]) from compare_perf import compare_perf cr=0.285 ci=0.01 N=1200 bound = 1.5 lim = 1000 cutoff = 1e6 extra = {} try: from numba import autojit extra['numba'] = autojit(julia_loops) except: print "Failed to import Numba" compare_perf(julia, [cr, ci, N, bound, lim, cutoff], numba = False, extra = extra)
if(x_pix_start < 0): x_pix_start = 0 if(x_pix_stop > nx): x_pix_stop = int32(nx-1) if(y_pix_start < 0): y_pix_start = 0 if(y_pix_stop > ny): y_pix_stop = int32(ny-1) for xpix in range(x_pix_start, x_pix_stop) : for ypix in range(y_pix_start, y_pix_stop) : # physical coordinates of pixel xpixel = pixel_to_physical(xpix,x_start,dx) ypixel = pixel_to_physical(ypix,y_start,dy) zpixel = zplane dxpix, dypix, dzpix = [x-xpixel,y-ypixel,z-zpixel] d = distance(dxpix,dypix,dzpix) if (d/h < 2) : kernel_val = kernel_vals[int(d/(.01*h))]/(h*h*h) image[xpix,ypix] += qt*kernel_val return image from compare_perf import compare_perf N = 1600 x = y = z = hs= qts = mass = rhos = np.random.rand(N) nx=ny=40 args = (x,y,z,hs,qts,mass,rhos,nx,ny, 0.0, 1.0, 0.0, 1.0) compare_perf(render_image, args)
def harris(I):
    """Return a Harris-style corner score for the points of a 2-D array.

    I is unpacked as a 2-D array of shape (m, n).  The returned array has
    shape (m - 1, n - 1): one score for every point at which both a row
    difference and a column difference are available.
    """
    m, n = I.shape
    # Differences between neighbouring rows and neighbouring columns.
    # Each one is trimmed by a column / a row so that both end up with
    # the same (m - 1, n - 1) shape.
    row_diff = I[1:, :] - I[:m - 1, :]
    col_diff = I[:, 1:] - I[:, :n - 1]
    dx = row_diff[:, 1:]
    dy = col_diff[1:, :]
    # Entries of the per-point matrix of derivative products
    #     M = | xx  xy |
    #         | xy  yy |
    xx = dx * dx
    yy = dy * dy
    xy = dx * dy
    # The score at each point is det(M) - k * trace(M)^2.
    trace = xx + yy
    determinant = xx * yy - xy * xy
    k = 0.05
    return determinant - k * trace * trace


from compare_perf import compare_perf

m, n = 1920, 1080
dtype = 'float64'
I = np.random.randn(m, n).astype(dtype)
compare_perf(harris, [I], propagate_exceptions=True, backends=("c", "openmp"))
from parakeet import jit, config, c_backend


def covariance(x, y):
    """Mean of the products of the mean-centred values of x and y."""
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    return (x_centered * y_centered).mean()


def fit_simple_regression(x, y):
    """Fit y = slope * x + offset and return (slope, offset).

    The slope is covariance(x, y) divided by covariance(x, x); the offset
    makes the line pass through the point of means.
    """
    cov_xy = covariance(x, y)
    var_x = covariance(x, x)
    slope = cov_xy / var_x
    offset = y.mean() - slope * x.mean()
    return slope, offset


import numpy as np

# Synthetic data lying exactly on a known line.
N = 10**7
x = np.random.randn(N).astype('float64')
slope = 903.29
offset = 102.1
y = slope * x + offset

from compare_perf import compare_perf

compare_perf(fit_simple_regression, (x, y))
x = np.random.randn(1500,1500).astype('float32') w = np.random.randn(3,3).astype('float32') #compare_perf(conv_3x3_trim, [x,w]) w = np.random.randn(3,3).astype('float32') # Simple convolution of 5x5 patches from a given array x # by a 5x5 array of filter weights def conv_3x3_trim_loops(image, weights): result = np.zeros_like(image) for i in xrange(1,x.shape[0]-1): for j in xrange(1,x.shape[1]-1): for ii in xrange(3): for jj in xrange(3): result[i,j] += image[i-ii+1, j-jj+1] * weights[ii, jj] return result compare_perf(conv_3x3_trim_loops, [x,w]) import parakeet def conv_3x3_imap(image, weights): def compute((i,j)): total = np.float32(0.0) for ii in xrange(3): for jj in xrange(3): total += image[i+ii-1, j + jj - 1] * weights[ii, jj] return total w,h = image.shape return parakeet.imap(compute, (w-2,h-2)) compare_perf(conv_3x3_imap, [x,w], backends=('openmp', 'cuda',))
if (x_pix_stop > nx): x_pix_stop = int32(nx - 1) if (y_pix_start < 0): y_pix_start = 0 if (y_pix_stop > ny): y_pix_stop = int32(ny - 1) for xpix in range(x_pix_start, x_pix_stop): for ypix in range(y_pix_start, y_pix_stop): # physical coordinates of pixel xpixel = pixel_to_physical(xpix, x_start, dx) ypixel = pixel_to_physical(ypix, y_start, dy) zpixel = zplane dxpix, dypix, dzpix = [ x - xpixel, y - ypixel, z - zpixel ] d = distance(dxpix, dypix, dzpix) if (d / h < 2): kernel_val = kernel_vals[int( d / (.01 * h))] / (h * h * h) image[xpix, ypix] += qt * kernel_val return image from compare_perf import compare_perf N = 160 x = y = z = hs = qts = mass = rhos = np.random.rand(N) nx = ny = 80 args = (x, y, z, hs, qts, mass, rhos, nx, ny, 0.0, 1.0, 0.0, 1.0) compare_perf(render_image, args, numba=True, backends=('c', ))
import numpy as np


def dot(x, y):
    """Min-plus counterpart of a dot product: the smallest entry of x + y."""
    sums = x + y
    return np.min(sums)


def matmult_high_level(X, Y):
    """Min-plus matrix product built from one dot() per (row, column) pair."""
    columns = Y.T
    return np.array([[dot(row, col) for col in columns] for row in X])


def matmult_loops(X, Y, Z):
    """Write the min-plus product of X and Y into Z and return Z.

    Z[r, c] is the minimum over k of X[r, k] + Y[k, c].  The running
    minimum starts from the k == 0 term, so the inner loop begins at 1.
    """
    n_rows, inner = X.shape
    n_cols = Y.shape[1]
    for r in xrange(n_rows):
        for c in xrange(n_cols):
            best = X[r, 0] + Y[0, c]
            for k in xrange(1, inner):
                best = min(best, X[r, k] + Y[k, c])
            Z[r, c] = best
    return Z


n, d = 500, 500
m = 500
X = np.random.randn(m, d)
Y = np.random.randn(d, n)
Z = np.zeros((m, n))

from compare_perf import compare_perf

compare_perf(matmult_high_level, [X, Y], cpython=True, numba=False)
compare_perf(matmult_loops, [X, Y, Z], cpython=False)
# set the minimum h to be equal to half pixel width # h = max_d_curr*.5 #h = max(h,0.55*dx) # particle pixel center xpos = physical_to_pixel(x,xmin,dx) ypos = physical_to_pixel(y,ymin,dy) left = xpos-k/2 upper = ypos-k/2 for i in xrange(0,k) : for j in xrange(0,k): if ((i+left>=0) and (i+left < nx) and (j+upper >=0) and (j+upper<ny)) : image[(i+left),(j+upper)] += kernel[i,j]*qt start_ind = end_ind return image N = 20 x = y = z = hs= qts = mass = rhos = np.random.rand(N) nx=ny=100 args = (x, y, qts,hs, nx, ny, 0.0, 1.0, 0.0, 1.0,1) template_kernel_cpu(*args) from compare_perf import compare_perf compare_perf(template_kernel_cpu, args)
def harris(I):
    """Return a Harris-style corner score for the points of a 2-D array.

    I is unpacked as a 2-D array of shape (m, n).  The returned array has
    shape (m - 1, n - 1): one score for every point at which both a row
    difference and a column difference are available.
    """
    m, n = I.shape
    # Differences between neighbouring rows and neighbouring columns.
    # Each one is trimmed by a column / a row so that both end up with
    # the same (m - 1, n - 1) shape.
    row_diff = I[1:, :] - I[:m - 1, :]
    col_diff = I[:, 1:] - I[:, :n - 1]
    dx = row_diff[:, 1:]
    dy = col_diff[1:, :]
    # Entries of the per-point matrix of derivative products
    #     M = | xx  xy |
    #         | xy  yy |
    xx = dx * dx
    yy = dy * dy
    xy = dx * dy
    # The score at each point is det(M) - k * trace(M)^2.
    trace = xx + yy
    determinant = xx * yy - xy * xy
    # k is a float32 scalar, like the float32 benchmark input below.
    k = np.float32(0.05)
    return determinant - k * trace * trace


from compare_perf import compare_perf

# Benchmark input: squared normal samples stored as float32.
m, n = 2400, 2400
dtype = 'float32'
I = (np.random.randn(m, n) ** 2).astype(dtype)
compare_perf(harris, [I], propagate_exceptions=True)
def matmult_loops(X,Y,Z): m, d = X.shape n = Y.shape[1] for i in xrange(m): for j in xrange(n): total = X[i,0] * Y[0,j] for k in xrange(1,d): total += X[i,k] * Y[k,j] Z[i,j] = total def call_numba(X,Y): Z = np.zeros((X.shape[0],Y.shape[1])).astype(dtype) matmult_loops(X,Y,Z) return Z extra['numba'] = call_numba except: print "Failed to import Numba" pass compare_perf(matmult_high_level, [X,Y], cpython=True, # numba can't run the nested comprehensions so we use # a special loopy version instead numba=False, extra = extra, suppress_output = False, propagate_exceptions = False)
import parakeet def growcut_par(image, state, window_radius): height = image.shape[0] width = image.shape[1] def attack((i,j)): winning_colony = state[i, j, 0] defense_strength = state[i, j, 1] for jj in xrange(max(j-window_radius,0), min(j+window_radius+1, width)): for ii in xrange(max(i-window_radius, 0), min(i+window_radius+1, height)): if ii != i or jj != j: d = image[i, j, :] - image[ii, jj, :] s = np.sum(d**2) gval = 1.0 - np.sqrt(s) / np.sqrt(3) attack_strength = gval * state[ii, jj, 1] if attack_strength > defense_strength: defense_strength = attack_strength winning_colony = state[ii, jj, 0] return [winning_colony, defense_strength] return parakeet.imap(attack, (height, width)) from compare_perf import compare_perf import time t = time.time() growcut_python(image, state, state_next, window_radius) t2 = time.time() print "Python time", t2 - t compare_perf(growcut_par, [image, state, window_radius], suppress_output = False, propagate_exceptions = True)
''' Computes the number of iterations `n` such that |z_n| > `lim`, where `z_n = z_{n-1}**2 + c`. ''' count = 0 while ((zr*zr + zi*zi) < (lim*lim)) and count < cutoff: zr, zi = zr * zr - zi * zi + cr, 2 * zr * zi + ci count += 1 return count def julia_loops(cr, ci, N, bound=1.5, lim=1000., cutoff=1e6): ''' Pure Python calculation of the Julia set for a given `c`. No NumPy array operations are used. ''' julia = np.empty((N, N), dtype=np.uint32) grid_x = np.linspace(-bound, bound, N) for i, x in enumerate(grid_x): for j, y in enumerate(grid_x): julia[i,j] = kernel(x, y, cr, ci, lim, cutoff=cutoff) return julia from compare_perf import compare_perf cr=0.285 ci=0.01 N=100 bound = 1.5 lim = 1000 cutoff = 1e6 compare_perf(julia_loops, [cr, ci, N, bound, lim, cutoff])
from parakeet import jit, config, c_backend


def covariance(x, y):
    """Mean of the products of the mean-centred values of x and y."""
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    return (x_centered * y_centered).mean()


def fit_simple_regression(x, y):
    """Fit y = slope * x + offset and return (slope, offset).

    The slope is covariance(x, y) divided by covariance(x, x); the offset
    makes the line pass through the point of means.
    """
    cov_xy = covariance(x, y)
    var_x = covariance(x, x)
    slope = cov_xy / var_x
    offset = y.mean() - slope * x.mean()
    return slope, offset


import numpy as np

# Synthetic data lying exactly on a known line.
N = 2*10**7
x = np.random.randn(N).astype('float64')
slope = 903.29
offset = 102.1
y = slope * x + offset

from compare_perf import compare_perf

compare_perf(fit_simple_regression, (x, y), numba=True)
pzi = points[i, 2] total = 0.0 for j in xrange(n_weights): weight_j = weights[j] xj = pos[j, 0] yj = pos[j, 1] zj = pos[j, 2] dx = pxi - pos[j, 0] dy = pyi - pos[j, 1] dz = pzi - pos[j, 2] dr = 1.0 / np.sqrt(dx * dx + dy * dy + dz * dz) total += weight_j * dr sum_array3d[i, 0] += weight_j * dx sum_array3d[i, 1] += weight_j * dy sum_array3d[i, 2] += weight_j * dz return total sum_array = np.array([compute(i) for i in xrange(n_points)]) return sum_array, sum_array3d n_points = 200 n_weights = 400 pos = np.random.randn(n_weights, 3) weights = np.random.randn(n_weights) points = np.random.randn(n_points, 3) from compare_perf import compare_perf compare_perf(summation, [pos, weights, points])
for offset in np.ndindex(wsize): neighbor_idx = tuple( mode(p, o - w / 2, w) for (p, o, w) in zip(pos, offset, wsize)) result[pos] &= (data[neighbor_idx] <= myval) return result @parakeet.jit def parakeet_local_maxima(data, wsize, mode=wrap): def is_max(pos): def is_smaller_neighbor(offset): neighbor_idx = tuple( mode(p, o - w / 2, w) for (p, o, w) in zip(pos, offset, wsize)) return data[neighbor_idx] <= data[pos] return np.all(parakeet.imap(is_smaller_neighbor, wsize)) return parakeet.imap(is_max, data.shape) # not sure how to get numba to auto-jit size generic code # get error: "FAILED with KeyError 'sized_pointer(npy_intp, 4)'" #import numba #numba_local_maxima = numba.autojit(python_local_maxima) from compare_perf import compare_perf shape = (30, 30, 20, 12) x = np.random.randn(*shape) compare_perf(local_maxima, [x, shape])
from parakeet import jit, config, c_backend


def covariance(x, y):
    """Mean of the products of the mean-centred values of x and y."""
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    return (x_centered * y_centered).mean()


def fit_simple_regression(x, y):
    """Fit y = slope * x + offset and return (slope, offset).

    The slope is covariance(x, y) divided by covariance(x, x); the offset
    makes the line pass through the point of means.
    """
    cov_xy = covariance(x, y)
    var_x = covariance(x, x)
    slope = cov_xy / var_x
    offset = y.mean() - slope * x.mean()
    return slope, offset


import numpy as np

# Synthetic data lying exactly on a known line.
N = 2 * 10**7
x = np.random.randn(N).astype('float64')
slope = 903.29
offset = 102.1
y = slope * x + offset

from compare_perf import compare_perf

compare_perf(fit_simple_regression, (x, y), numba=True)
for xidx in xrange(ndims): curr_dist += (x[xidx] - centroid[xidx])**2 if curr_dist < min_dist: min_dist = curr_dist min_idx = cidx A[i] = min_idx # recompute the clusters by averaging data points # assigned to them for cidx in xrange(k): # reset centroids for dim_idx in xrange(ndims): C[cidx, dim_idx] = 0 # add each data point only to its assigned centroid cluster_count = 0 for i in xrange(n): if A[i] == cidx: C[cidx, :] += X[i, :] cluster_count += 1 C[cidx, :] /= cluster_count return C n, d = 10**4, 50 X = np.random.randn(n,d) k = 25 from compare_perf import compare_perf compare_perf(kmeans_comprehensions, [X, k, 5],cpython=False) compare_perf(kmeans_loops, [X, k, 5], cpython=True)
for i in range(steps): previous_grid[:, :] = old_grid old_grid[:, :] = grid for x in range(l_x): for y in range(l_y): grid[x,y] = 0.0 if x + 1 < l_x: grid[x,y] += old_grid[x+1,y] if 0 < x-1 and x - 1 < l_x: grid[x,y] += old_grid[x-1,y] if y+1 < l_y: grid[x,y] += old_grid[x,y+1] if 0 < y-1 and y-1 < l_y: grid[x,y] += old_grid[x,y-1] grid[x,y] /= 2.0 grid[x,y] -= previous_grid[x,y] return grid N = 1000 steps = 20 input_grid = np.random.randn(N,N).astype('float64') import parakeet parakeet.config.print_generated_code = True from compare_perf import compare_perf compare_perf(fdtd, [input_grid, steps], backends = ('c', 'openmp', 'cuda'))
def harris(I):
    """Return a Harris-style corner score for the points of a 2-D array.

    I is unpacked as a 2-D array of shape (m, n).  The returned array has
    shape (m - 1, n - 1): one score for every point at which both a row
    difference and a column difference are available.
    """
    m, n = I.shape
    # Differences between neighbouring rows and neighbouring columns.
    # Each one is trimmed by a column / a row so that both end up with
    # the same (m - 1, n - 1) shape.
    row_diff = I[1:, :] - I[:m - 1, :]
    col_diff = I[:, 1:] - I[:, :n - 1]
    dx = row_diff[:, 1:]
    dy = col_diff[1:, :]
    # Entries of the per-point matrix of derivative products
    #     M = | xx  xy |
    #         | xy  yy |
    xx = dx * dx
    yy = dy * dy
    xy = dx * dy
    # The score at each point is det(M) - k * trace(M)^2.
    trace = xx + yy
    determinant = xx * yy - xy * xy
    # k is a float32 scalar, like the float32 benchmark input below.
    k = np.float32(0.05)
    return determinant - k * trace * trace


from compare_perf import compare_perf

# Benchmark input: squared normal samples stored as float32.
m, n = 2400, 2400
dtype = 'float32'
I = (np.random.randn(m, n)**2).astype(dtype)
compare_perf(harris, [I], propagate_exceptions=True)
l_x = grid.shape[0] l_y = grid.shape[1] for i in range(steps): previous_grid[:, :] = old_grid old_grid[:, :] = grid for x in range(l_x): for y in range(l_y): grid[x,y] = 0.0 if x + 1 < l_x: grid[x,y] += old_grid[x+1,y] if 0 < x-1 and x - 1 < l_x: grid[x,y] += old_grid[x-1,y] if y+1 < l_y: grid[x,y] += old_grid[x,y+1] if 0 < y-1 and y-1 < l_y: grid[x,y] += old_grid[x,y-1] grid[x,y] /= 2.0 grid[x,y] -= previous_grid[x,y] return grid N = 1000 steps = 20 input_grid = np.random.randn(N,N).astype('float32') from compare_perf import compare_perf compare_perf(fdtd, [input_grid, steps], backends = ('c', 'openmp', 'cuda'))
""" Accepted response on stack overflow by phillip """ gg = np.outer(g, g) gggg = np.outer(gg, gg).reshape(4 * g.shape) axes = ((0, 2, 4, 6), (0, 1, 2, 3)) return np.tensordot(gggg, T, axes) T = np.random.randn(n, n, n, n) g = np.random.randn(n, n) from compare_perf import compare_perf compare_perf( rotT_loops, [T, g], extra={"numpy_tensordot": rotT_numpy}, numba=False, backends=("c", "openmp"), cpython=True ) def rotT_par(T, g): def compute_elt(i, j, k, l): total = 0.0 for ii in range(n): for jj in range(n): for kk in range(n): for ll in range(n): gg = g[ii, i] * g[jj, j] * g[kk, k] * g[ll, l] total += gg * T[ii, jj, kk, ll] return total return np.array(
u[i, j] = mu * (temp_u[i + 1, j] + temp_u[i - 1, j] + temp_u[i, j + 1] + temp_u[i, j - 1] - 4 * temp_u[i, j]) temp = u u = temp_u temp_u = temp return u def diffuse_array_expressions(iter_num): u = np.zeros((Lx, Ly), dtype=np.float64) temp_u = np.zeros_like(u) temp_u[Lx / 2, Ly / 2] = 1000.0 for i in range(iter_num): u[1:-1, 1:-1] = mu * (temp_u[2:, 1:-1] + temp_u[:-2, 1:-1] + temp_u[1:-1, 2:] + temp_u[1:-1, :-2] - 4 * temp_u[1:-1, 1:-1]) temp = u u = temp_u temp_u = temp return u from compare_perf import compare_perf compare_perf(diffuse_loops, [N], numba=True) compare_perf( diffuse_array_expressions, [N], numba =True)
pxi = points[i, 0] pyi = points[i, 1] pzi = points[i, 2] total = 0.0 for j in xrange(n_weights): weight_j = weights[j] xj = pos[j,0] yj = pos[j,1] zj = pos[j,2] dx = pxi - pos[j, 0] dy = pyi - pos[j, 1] dz = pzi - pos[j, 2] dr = 1.0/np.sqrt(dx*dx + dy*dy + dz*dz) total += weight_j * dr sum_array3d[i,0] += weight_j * dx sum_array3d[i,1] += weight_j * dy sum_array3d[i,2] += weight_j * dz return total sum_array = np.array([compute(i) for i in xrange(n_points)]) return sum_array, sum_array3d n_points = 200 n_weights = 400 pos = np.random.randn(n_weights, 3) weights = np.random.randn(n_weights) points = np.random.randn(n_points, 3) from compare_perf import compare_perf compare_perf(summation, [pos, weights, points])
# Simple convolution of 5x5 patches from a given array x # by a 5x5 array of filter weights def conv_3x3_trim_loops(image, weights): result = np.zeros_like(image) for i in xrange(1, x.shape[0] - 1): for j in xrange(1, x.shape[1] - 1): for ii in xrange(3): for jj in xrange(3): result[i, j] += image[i - ii + 1, j - jj + 1] * weights[ii, jj] return result compare_perf(conv_3x3_trim_loops, [x, w]) import parakeet def conv_3x3_imap(image, weights): def compute((i, j)): total = np.float32(0.0) for ii in xrange(3): for jj in xrange(3): total += image[i + ii - 1, j + jj - 1] * weights[ii, jj] return total w, h = image.shape return parakeet.imap(compute, (w - 2, h - 2))
import numpy as np
from compare_perf import compare_perf


# Simple convolution of 3x3 patches from a given array x
# by a 3x3 array of filter weights
def conv_3x3_trim(x, weights):
    """Weighted sum of the 3x3 patch around each visited centre of x.

    Centres run over xrange(1, x.shape[k] - 2) on each axis, so the
    result has x.shape[0] - 3 rows and x.shape[1] - 3 columns.
    """
    # NOTE(review): conv_3x3_trim_loops visits centres up to shape - 2
    # inclusive, one more per axis than this range does -- confirm which
    # bound is intended before relying on the output shape.
    return np.array([[(x[i-1:i+2, j-1:j+2] * weights).sum()
                      for j in xrange(1, x.shape[1] - 2)]
                     for i in xrange(1, x.shape[0] - 2)])


x = np.random.randn(1200, 1200).astype('float32')
w = np.random.randn(3, 3).astype('float32')
compare_perf(conv_3x3_trim, [x, w])

w = np.random.randn(3, 3).astype('float32')


# Simple convolution of 3x3 patches from a given image
# by a 3x3 array of filter weights
def conv_3x3_trim_loops(image, weights):
    """Convolve the interior of a 2-D image with a 3x3 weight array.

    Returns an array shaped like image.  The outermost rows and columns
    are never written and keep the zero they were initialised with.
    """
    result = np.zeros_like(image)
    # Bounds are taken from the image argument, not from the module-level
    # benchmark array x, so the function is correct for any input shape.
    for i in xrange(1, image.shape[0] - 1):
        for j in xrange(1, image.shape[1] - 1):
            for ii in xrange(3):
                for jj in xrange(3):
                    result[i, j] += image[i - ii + 1, j - jj + 1] * weights[ii, jj]
    return result
temp_u[i, j + 1] + temp_u[i, j - 1] - 4 * temp_u[i, j]) temp = u u = temp_u temp_u = temp return u def diffuse_array_expressions(iter_num): u = np.zeros((Lx, Ly), dtype=np.float64) temp_u = np.zeros_like(u) temp_u[Lx / 2, Ly / 2] = 1000.0 for i in range(iter_num): u[1:-1, 1:-1] = mu * (temp_u[2:, 1:-1] + temp_u[:-2, 1:-1] + temp_u[1:-1, 2:] + temp_u[1:-1, :-2] - 4 * temp_u[1:-1, 1:-1]) temp = u u = temp_u temp_u = temp return u from compare_perf import compare_perf compare_perf(diffuse_loops, [N], numba=True) compare_perf(diffuse_array_expressions, [N], numba=True)
return distance_matrix def arc_distance_numpy_broadcast(a, b): """ Calculates the pairwise arc distance between all points in vector a and b. """ theta_1 = a[:, 0][:, None] theta_2 = b[:, 0][None, :] phi_1 = a[:, 1][:, None] phi_2 = b[:, 1][None, :] temp = (np.sin((theta_2 - theta_1) / 2)**2 + np.cos(theta_1) * np.cos(theta_2) * np.sin((phi_2 - phi_1) / 2)**2) distance_matrix = 2 * (np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))) return distance_matrix from compare_perf import compare_perf n = 1000 import numpy as np a = np.random.rand(n, 2) b = np.random.rand(n, 2) compare_perf(arc_distance_python_nested_for_loops, [a,b]) compare_perf(arc_distance_numpy_broadcast, [a,b]) compare_perf(arc_distance_numpy_tile, [a,b])