def runTP_Sims(A_input, b_input, x_input, probDist, maxIters, errorType=0, utType="Reg", tpType="Reg"):
    """Iterate a curvature-scaled TP update on the system (A, b) and record the error.

    Each iteration applies ``TPCore.TPAlgo`` twice (to the current iterate and
    to its image), forms the first difference ``tp1 - x`` and the second
    difference ``(tp2 - tp1) - (tp1 - x)``, and moves along the first
    difference with step size ``1 / kappa`` where ``kappa`` is the ratio of
    the two-norms of the second and first differences.

    Args:
        A_input: 2-D array; its shape is unpacked as (numStates, numFeatures).
        b_input: right-hand side; copied, never modified here.
        x_input: starting point; copied and reshaped to ``(numFeatures,)``.
        probDist: forwarded unchanged to ``errorCalcs.getErrorMethod``.
        maxIters: length of the returned error trace; must be at least 1
            because slot 0 always receives the initial error.
        errorType: forwarded unchanged to ``errorCalcs.getErrorMethod``.
        utType: unused in this function.
        tpType: unused in this function.

    Returns:
        Tuple ``(x_proc, errors)``: the final iterate and an array of
        ``maxIters`` error values (index 0 is the error of the start point).
    """
    _, numFeatures = A_input.shape

    # Work on private copies so the caller's arrays are not rebound or resized.
    A_proc = np.array(A_input, copy=True)
    b_proc = np.array(b_input, copy=True)
    x_proc = np.array(x_input, copy=True).reshape(numFeatures)

    errors = np.zeros(maxIters)
    errors[0] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    for step in range(1, maxIters):
        tp1 = TPCore.TPAlgo(A_proc, b_proc, x_proc)
        tp2 = TPCore.TPAlgo(A_proc, b_proc, tp1)

        first_diff = tp1 - x_proc
        second_diff = (tp2 - tp1) - first_diff

        # NOTE(review): if first_diff or second_diff has zero norm (e.g. at a
        # fixed point) this division degenerates to nan/inf - confirm callers
        # never reach that state.
        kappa = utils.twoNorm(second_diff) / utils.twoNorm(first_diff)
        alpha = 1 / kappa

        x_proc = x_proc + alpha * first_diff
        errors[step] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    return x_proc, errors
def ConstantMomentum(A_input, b_input, x_input, probDist, maxIters, errorType=0, utType="Reg", tpType="Reg", momentumMult=0.1):
    """Stochastic curvature-scaled TP iteration with a constant momentum term.

    Every iteration samples ``numSamples`` (3) row indices with probabilities
    ``probDist``, applies ``TPCore.TPAlgosampledRows`` twice on those rows,
    and updates the iterate with

        x <- x + alpha * radius * (tp1 - x) + momentumMult * (x - x_prev)

    where ``radius = 1 / kappa``, ``kappa`` is the ratio of the two-norms of
    the second and first differences, and ``alpha`` decays as
    ``1 / (iters * numSamples / numStates + 1)``.

    Args:
        A_input: 2-D array; its shape is unpacked as (numStates, numFeatures).
        b_input: right-hand side; copied, never modified here.
        x_input: starting point; copied and reshaped to ``(numFeatures,)``.
        probDist: sampling probabilities over the ``numStates`` rows; also
            forwarded to ``errorCalcs.getErrorMethod``.
        maxIters: length of the returned error trace; must be at least 1.
        errorType: forwarded unchanged to ``errorCalcs.getErrorMethod``.
        utType: unused in this function.
        tpType: unused in this function.
        momentumMult: multiplier applied to ``x - x_prev``. The previous
            implementation overwrote this with 0.1 inside the loop, so the
            argument had no effect; it is now honoured (the default of 0.1
            reproduces the old behaviour).

    Returns:
        Tuple ``(x_proc, errors)``: the final iterate and an array of
        ``maxIters`` error values (index 0 is the error of the start point).
    """
    numStates, numFeatures = A_input.shape

    # Private copies: the caller's arrays are never rebound here.
    A_proc = np.array(A_input, copy=True)
    b_proc = np.array(b_input, copy=True)
    x_proc = np.array(x_input, copy=True).reshape(numFeatures)
    # Previous iterate; equal to the start point so the first momentum is 0.
    x_prev = x_proc

    errors = np.zeros(maxIters)
    errors[0] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    numSamples = 3  # rows drawn per iteration

    for iters in range(1, maxIters):
        sampledRows = random.choice(numStates, numSamples, p=probDist)
        tp1 = TPCore.TPAlgosampledRows(A_proc, b_proc, x_proc, sampledRows=sampledRows)
        tp2 = TPCore.TPAlgosampledRows(A_proc, b_proc, tp1, sampledRows=sampledRows)

        dTP1 = tp1 - x_proc
        ddTP = (tp2 - tp1) - dTP1

        kappa = utils.twoNorm(ddTP) / utils.twoNorm(dTP1)
        # Radius of osculating circle
        radius = 1 / kappa

        # Momentum uses the displacement of the *previous* update, so it must
        # be read before x_prev is advanced.
        momentumTerm = x_proc - x_prev
        x_prev = x_proc

        alpha = 1 / (iters * numSamples / numStates + 1)
        x_proc = x_proc + alpha * radius * dTP1 + momentumMult * momentumTerm

        errors[iters] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    return x_proc, errors
def TPAlgoNormDistance(A, b, xk, norm):
    """Return the next iterate after one sweep over every row of ``A``.

    For each row ``Ai`` the residual ``r = Ai . xk - b[i]`` is evaluated at
    the *input* point ``xk`` (not at the partially updated point), and

        Ai * r * ||r||^(norm - 2) / (m^(norm - 1) * ||Ai||^norm)

    is subtracted from the running result, where ``m = len(A)`` and
    ``||.||`` is ``utils.twoNorm``.

    Args:
        A: 2-D array of row vectors (indexed as ``A[i, :]``).
        b: right-hand side, indexed as ``b[i]``.
        xk: current iterate; it is copied, not modified.
        norm: exponent used in the weighting above.

    Returns:
        A new array holding the updated iterate.
    """
    num_rows = len(A)
    next_x = np.array(xk, copy=True)
    # Loop-invariant part of the denominator.
    row_count_factor = num_rows ** (norm - 1)

    for i in range(num_rows):
        row = A[i, :]
        residual = row.dot(xk) - b[i]
        numerator = row * residual * (utils.twoNorm(residual) ** (norm - 2))
        denominator = row_count_factor * ((utils.twoNorm(row)) ** norm)
        next_x = next_x - numerator / denominator

    return next_x
def getErrorfromAb_SumDistanceFromHPs(A, b, x, probDist):
    """Return the two-norm of the weighted residuals of the row-normalised system.

    Every row ``A[i, :]`` and entry ``b[i]`` is divided by
    ``utils.twoNorm(A[i, :])``; the residual ``A_unit . x - b_unit`` is then
    left-multiplied by ``np.diag(probDist)`` and passed to ``utils.twoNorm``.

    Unlike the previous implementation this function does NOT modify ``A`` or
    ``b``: the old code normalised the rows in place (``b`` through a
    reshaped view), so merely measuring the error rewrote the caller's
    system and truncated the quotients when the inputs had an integer dtype.

    Args:
        A: 2-D array; the first ``b.shape[0]`` rows are used.
        b: array whose total size equals ``b.shape[0]``.
        x: array whose total size equals ``x.shape[0]``.
        probDist: weights handed to ``np.diag``.
            NOTE(review): presumably a 1-D vector with one weight per row -
            confirm against callers.

    Returns:
        ``utils.twoNorm`` of the weighted residual vector.
    """
    x_flat = x.reshape(x.shape[0])
    b_flat = b.reshape(b.shape[0])
    numRows = b.shape[0]

    # One norm per row, gathered up front so the normalisation below can
    # build new arrays instead of overwriting the inputs.
    row_norms = np.array([utils.twoNorm(A[i, :]) for i in range(numRows)])

    # True division allocates fresh (floating-point) arrays.
    A_unit = A[:numRows, :] / row_norms[:, np.newaxis]
    b_unit = b_flat / row_norms

    error_per_coordinate = A_unit.dot(x_flat) - b_unit
    weighted_error_per_coordinate = np.diag(probDist) @ error_per_coordinate
    return utils.twoNorm(weighted_error_per_coordinate)
def getErrorfromAb(A, b, x, probDist):
    """Return the two-norm of the weighted absolute residuals ``|A . x - b|``.

    The residual vector is left-multiplied by ``np.diag(probDist)`` before
    ``utils.twoNorm`` is applied.

    Args:
        A: array supporting ``A.dot``.
        b: array whose total size equals ``b.shape[0]``.
        x: array whose total size equals ``x.shape[0]``.
        probDist: weights handed to ``np.diag``.
            NOTE(review): presumably one weight per row of ``A`` - confirm.

    Returns:
        ``utils.twoNorm`` of the weighted residual vector.
    """
    flat_x = x.reshape(x.shape[0])
    flat_b = b.reshape(b.shape[0])

    abs_residual = np.abs(A.dot(flat_x) - flat_b)
    weighted_residual = np.diag(probDist) @ abs_residual

    return utils.twoNorm(weighted_residual)
def Adagrad(A_input, b_input, x_input, probDist, maxIters, errorType=0, utType="Reg", tpType="Reg", momentumMult=0.1):
    """Stochastic curvature-scaled TP iteration with an Adagrad-style rescaling.

    Every iteration samples 3 row indices with probabilities ``probDist``,
    applies ``TPCore.TPAlgosampledRows`` twice on them, and divides the step
    ``alpha * radius * (tp1 - x)`` coordinate-wise by the square root of the
    running sum of squared first differences (plus 1e-6).

    Args:
        A_input: 2-D array; its shape is unpacked as (numStates, numFeatures).
        b_input: right-hand side; copied, never modified here.
        x_input: starting point; copied and reshaped to ``(numFeatures,)``.
        probDist: sampling probabilities over the rows; also forwarded to
            ``errorCalcs.getErrorMethod``.
        maxIters: length of the returned error trace; must be at least 1.
        errorType: forwarded unchanged to ``errorCalcs.getErrorMethod``.
        utType: unused in this function.
        tpType: unused in this function.
        momentumMult: unused in this function.

    Returns:
        Tuple ``(x_proc, errors)``: the final iterate and an array of
        ``maxIters`` error values (index 0 is the error of the start point).
    """
    numStates, numFeatures = A_input.shape

    A_proc = np.array(A_input, copy=True)
    b_proc = np.array(b_input, copy=True)
    x_proc = np.array(x_input, copy=True).reshape(numFeatures)

    errors = np.zeros(maxIters)
    errors[0] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    # Running (never decayed) sum of squared first differences.
    squared_step_sum = np.zeros(x_proc.shape)
    numSamples = 3
    smoothing = 1e-6

    for iters in range(1, maxIters):
        sampledRows = random.choice(numStates, numSamples, p=probDist)
        tp1 = TPCore.TPAlgosampledRows(A_proc, b_proc, x_proc, sampledRows=sampledRows)
        tp2 = TPCore.TPAlgosampledRows(A_proc, b_proc, tp1, sampledRows=sampledRows)

        dTP1 = tp1 - x_proc
        ddTP = (tp2 - tp1) - dTP1

        kappa = utils.twoNorm(ddTP) / utils.twoNorm(dTP1)
        # Radius of osculating circle
        radius = 1 / kappa

        alpha = 1 / (iters * numSamples / numStates + 1)
        raw_step = alpha * radius * dTP1

        squared_step_sum = squared_step_sum + dTP1 ** 2
        # The smoothing constant sits inside the square root here.
        x_proc = x_proc + raw_step / (squared_step_sum + smoothing) ** 0.5

        errors[iters] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    return x_proc, errors
def getWeightedErrorfromPhi(weights, phi, cost, gamma, tm, statDist, errorNorm=2):
    """Return a weighted error of ``weights`` on the row-normalised system built from phi.

    The system is ``A = phi - gamma * tm @ phi`` and ``b = cost``. Each row
    of ``A`` and the matching entry of ``b`` are divided by the row's
    ``utils.twoNorm`` (the original author notes this turns residuals into
    distances from the hyperplanes). The absolute residuals ``|A . weights - b|``
    are left-multiplied by ``np.diag(statDist)`` and reduced according to
    ``errorNorm``.

    Changes relative to the previous implementation:
      * ``np.Infinity`` (removed in NumPy 2.0) is no longer referenced.
      * The bare ``except:`` is narrowed; on a weighting failure the
        diagnostics are still printed and ``np.inf`` is returned instead of
        falling through with a scalar placeholder.
      * Row norms are computed once and applied with true division, so an
        integer-typed ``cost`` is no longer truncated.

    Args:
        weights: array whose total size equals ``weights.shape[0]``.
        phi: 2-D array.
        cost: array-like whose total size equals its first dimension; it is
            not modified.
        gamma: scalar multiplier applied to ``tm``.
        tm: matrix left-multiplying ``phi``.
        statDist: weights handed to ``np.diag``.
            NOTE(review): presumably a 1-D distribution over rows - confirm.
        errorNorm: ``np.inf`` selects the infinity norm, ``2`` selects
            ``utils.twoNorm``, anything else selects the mean.

    Returns:
        The reduced weighted error, or ``np.inf`` if the weighting step
        failed.
    """
    A = phi - gamma * tm @ phi

    # BIG NOTE (from the original author): please check whether row
    # normalisation is acceptable when measuring errors. Row normalisation
    # leads to pure distances from the hyperplanes, so it is a good measure.
    row2Norm = np.apply_along_axis(utils.twoNorm, 1, A)
    A = A / row2Norm[:, np.newaxis]

    b = np.asarray(cost)
    b = b.reshape(b.shape[0]) / row2Norm
    weights = weights.reshape(weights.shape[0])

    Adotx = A.dot(weights)
    errors = np.abs(Adotx - b).reshape(b.shape[0])

    try:
        weightedError = np.diag(statDist) @ errors
    except (ValueError, TypeError):
        # Best effort, as before: report the shapes and signal failure with
        # an infinite error rather than aborting the caller's run.
        print("An exception occurred")
        print("A=", A.shape, "b=", b.shape, "adotx", Adotx.shape, "errors = ", errors.shape)
        return np.inf

    if errorNorm == np.inf:
        return np.linalg.norm(weightedError, ord=np.inf)
    if errorNorm == 2:
        return utils.twoNorm(weightedError)
    return np.mean(weightedError)
def NADAM(A_input, b_input, x_input, probDist, maxIters, errorType=0, utType="Reg", tpType="Reg", momentumMult=0.1):
    """Stochastic curvature-scaled TP iteration with a Nadam-style update.

    Every iteration samples 3 row indices with probabilities ``probDist``,
    applies ``TPCore.TPAlgosampledRows`` twice on them, and treats the first
    difference ``dTP1 = tp1 - x`` as the update direction fed into Nadam
    accumulators with ``beta1 = 0.9`` and ``beta2 = 0.999``.

    Fix relative to the previous implementation: the iterate used to be
    updated with ``alpha * mUpdater / (sqrt(vHat) + eps) - alpha * dTP1``.
    The ``- alpha * dTP1`` part only makes sense when ``alpha * dTP1`` is
    added back separately (as the general runner in this file does); here it
    was never added back, so a spurious negative step was applied. The update
    is now the plain Nadam step ``alpha * mUpdater / (sqrt(vHat) + eps)``.

    Args:
        A_input: 2-D array; its shape is unpacked as (numStates, numFeatures).
        b_input: right-hand side; copied, never modified here.
        x_input: starting point; copied and reshaped to ``(numFeatures,)``.
        probDist: sampling probabilities over the rows; also forwarded to
            ``errorCalcs.getErrorMethod``.
        maxIters: length of the returned error trace; must be at least 1.
        errorType: forwarded unchanged to ``errorCalcs.getErrorMethod``.
        utType: unused in this function.
        tpType: unused in this function.
        momentumMult: unused in this function.

    Returns:
        Tuple ``(x_proc, errors)``: the final iterate and an array of
        ``maxIters`` error values (index 0 is the error of the start point).
    """
    numStates, numFeatures = A_input.shape

    A_proc = np.array(A_input, copy=True)
    b_proc = np.array(b_input, copy=True)
    x_proc = np.array(x_input, copy=True).reshape(numFeatures)

    errors = np.zeros(maxIters)
    errors[0] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-6
    numSamples = 3

    meanSquareGradientAccumulator = np.zeros(x_proc.shape)
    momentumAccumulator = np.zeros(x_proc.shape)

    for iters in range(1, maxIters):
        sampledRows = random.choice(numStates, numSamples, p=probDist)
        tp1 = TPCore.TPAlgosampledRows(A_proc, b_proc, x_proc, sampledRows=sampledRows)
        tp2 = TPCore.TPAlgosampledRows(A_proc, b_proc, tp1, sampledRows=sampledRows)

        dTP1 = tp1 - x_proc
        ddTP = (tp2 - tp1) - dTP1

        kappa = utils.twoNorm(ddTP) / (utils.twoNorm(dTP1)) ** 2
        # Radius of osculating circle
        radius = 1 / kappa
        # The norm of dTP1 is multiplied in (via kappa) and divided out again
        # here; the original author kept both steps for clarity.
        radiusByNorm_dTP1 = radius / utils.twoNorm(dTP1)

        alpha = 1 / (iters * numSamples / numStates + 1)
        alpha = alpha * radiusByNorm_dTP1

        momentumAccumulator = beta1 * momentumAccumulator + (1 - beta1) * dTP1
        meanSquareGradientAccumulator = (
            beta2 * meanSquareGradientAccumulator + (1 - beta2) * dTP1 ** 2
        )

        # NOTE(review): the bias correction uses exponent iters + 1 although
        # the first update happens at iters == 1; kept as in the original -
        # confirm whether exponent iters was intended.
        biasCorrection1 = 1 - beta1 ** (iters + 1)
        mHat = momentumAccumulator / biasCorrection1
        vHat = meanSquareGradientAccumulator / (1 - beta2 ** (iters + 1))

        # Nesterov-style look-ahead: blend the corrected momentum with the
        # current direction.
        mUpdater = beta1 * mHat + (1 - beta1) / biasCorrection1 * dTP1

        x_proc = x_proc + alpha * mUpdater / (vHat ** 0.5 + epsilon)

        errors[iters] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    return x_proc, errors
def CodeRunner(A_input, b_input, x_input, probDist, maxIters, errorType=0,
               stochasticType="Stochastic", stepSizeType="Curvature Step",
               momentumType="Heavy Ball Momentum", momentumParam=0.5,
               momentumParam2=0.99):
    """Run the TP iteration with a selectable sampling, step-size and momentum scheme.

    Args:
        A_input: 2-D array; its shape is unpacked as (numStates, numFeatures).
        b_input: right-hand side; copied, never modified here.
        x_input: starting point; copied and reshaped to ``(numFeatures,)``.
        probDist: row-sampling probabilities (stochastic mode); always
            forwarded to ``errorCalcs.getErrorMethod``.
        maxIters: length of the returned error trace; must be at least 1.
        errorType: forwarded unchanged to ``errorCalcs.getErrorMethod``.
        stochasticType: ``"Non-Stochastic"`` uses ``TPCore.TPAlgo`` on the
            whole system with base step 1; any other value samples 3 rows per
            iteration and uses ``TPCore.TPAlgosampledRows`` with a decaying
            base step.
        stepSizeType: ``"Curvature Step"`` rescales the base step by the
            curvature-derived factor; any other value leaves it untouched.
        momentumType: one of ``"No Momentum"``, ``"Heavy Ball Momentum"``,
            ``"RMSProp"``, ``"Adagrad"``, ``"ADAM"``, ``"Nadam"``; any other
            value behaves like ``"No Momentum"``.
        momentumParam: heavy-ball multiplier, RMSProp decay, or first-moment
            decay for ADAM/Nadam.
        momentumParam2: second-moment decay for ADAM/Nadam.

    Returns:
        Tuple ``(x_proc, errors)``: the final iterate and an array of
        ``maxIters`` error values (index 0 is the error of the start point).
    """
    # Size of the linear system.
    numStates, numFeatures = A_input.shape

    # Private copies so the original A and b are not touched.
    A_proc = np.array(A_input, copy=True)
    b_proc = np.array(b_input, copy=True)

    # Working iterate and the iterate before it (used by heavy-ball only).
    x_proc = np.array(x_input, copy=True).reshape(numFeatures)
    x_prev = x_proc

    errors = np.zeros(maxIters)
    errors[0] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    # State shared by the adaptive schemes.
    second_moment = np.zeros(x_proc.shape)
    first_moment = np.zeros(x_proc.shape)
    eps = 1e-6

    use_full_system = stochasticType == "Non-Stochastic"
    use_curvature = stepSizeType == "Curvature Step"

    for iters in range(1, maxIters):
        if use_full_system:
            tp1 = TPCore.TPAlgo(A_proc, b_proc, x_proc)
            tp2 = TPCore.TPAlgo(A_proc, b_proc, tp1)
            # Full information is available, so the step is not decayed.
            alpha = 1
        else:
            numSamples = 3
            sampledRows = random.choice(numStates, numSamples, p=probDist)
            tp1 = TPCore.TPAlgosampledRows(A_proc, b_proc, x_proc, sampledRows=sampledRows)
            tp2 = TPCore.TPAlgosampledRows(A_proc, b_proc, tp1, sampledRows=sampledRows)
            alpha = 1 / (iters * numSamples / numStates + 1)

        # Main update direction.
        dTP1 = tp1 - x_proc

        if use_curvature:
            ddTP = (tp2 - tp1) - dTP1
            dTP1_norm = utils.twoNorm(dTP1)
            kappa = utils.twoNorm(ddTP) / dTP1_norm ** 2
            # Radius of osculating circle
            radius = 1 / kappa
            # The norm of dTP1 is multiplied in (via kappa) and divided out
            # again; the original author kept both steps for clarity.
            alpha = alpha * (radius / dTP1_norm)

        # The un-modified step; every adaptive scheme below subtracts it from
        # its own step so that adding both back yields only the adaptive one.
        plain_step = alpha * dTP1

        if momentumType == "Heavy Ball Momentum":
            # (current iterate - previous iterate) * constant
            momentumTerm = momentumParam * (x_proc - x_prev)
            x_prev = x_proc
        elif momentumType in ("RMSProp", "Adagrad"):
            if momentumType == "RMSProp":
                # Exponentially decayed mean of squared directions.
                second_moment = momentumParam * second_moment + \
                    (1 - momentumParam) * dTP1 ** 2
            else:
                # Adagrad: the accumulator only ever grows; no parameter.
                second_moment = second_moment + dTP1 ** 2
            # Per-coordinate rescaling of the plain step.
            momentumTerm = plain_step / (second_moment ** 0.5 + eps) - plain_step
        elif momentumType in ("ADAM", "Nadam"):
            first_moment = momentumParam * first_moment + (1 - momentumParam) * dTP1
            second_moment = momentumParam2 * second_moment + \
                (1 - momentumParam2) * dTP1 ** 2
            # NOTE(review): exponent is iters + 1 although the first update
            # happens at iters == 1 - confirm this offset is intended.
            first_correction = 1 - momentumParam ** (iters + 1)
            mHat = first_moment / first_correction
            vHat = second_moment / (1 - momentumParam2 ** (iters + 1))
            if momentumType == "ADAM":
                direction = mHat
            else:
                # Nadam moves along a "more current estimate" of mHat; see
                # https://ruder.io/optimizing-gradient-descent/
                direction = momentumParam * mHat + \
                    (1 - momentumParam) / first_correction * dTP1
            momentumTerm = alpha * direction / (vHat ** 0.5 + eps) - plain_step
        else:
            # "No Momentum" and any unrecognised option contribute nothing.
            momentumTerm = np.zeros(x_proc.shape)

        # Step size times current direction, plus the momentum term.
        x_proc = x_proc + plain_step + momentumTerm
        errors[iters] = errorCalcs.getErrorMethod(A_proc, b_proc, x_proc, probDist, errorType, norm=2)

    return x_proc, errors