def __init__(self, args, sc):
    """Load the input matrix and vector and initialise the solver state.

    Args:
        args: Parsed options. The attributes ``partitions``, ``iterations``,
            ``inputVector``, ``inputMatrix`` and ``outputVector`` are read.
        sc: Context object stored as ``self.ctx`` and handed to the reader
            helpers.
    """
    self.EPSILON = 1.0e-5
    self.ctx = sc
    self.numPartitions = args.partitions
    self.numIterations = args.iterations
    self.inputVectorPath = args.inputVector
    self.inputMatrixPath = args.inputMatrix
    self.outputVectorPath = args.outputVector

    # Read the matrix and wrap every record as IndexedRow(record[0], record[1]).
    indexed_rows = readMatrixRowPerLine(self.inputMatrixPath, self.ctx).map(
        lambda record: IndexedRow(record[0], record[1]))
    # A partition count of 0 leaves the partitioning produced by the reader.
    if self.numPartitions != 0:
        indexed_rows = indexed_rows.repartition(self.numPartitions)
    self.inputMatrix = IndexedRowMatrix(indexed_rows)

    self.inputVector = readVector(self.inputVectorPath, self.ctx)
    vector_length = self.inputVector.size

    # An iteration count of 0 selects the default: twice the vector length.
    if self.numIterations == 0:
        self.numIterations = vector_length * 2

    # The accumulated result starts as the zero vector.
    self.result = Vectors.zeros(vector_length)
def DGEMV(alpha, A, x, beta, y, jsc):
    """Scale ``y`` by ``beta`` and add the row-wise product of ``A`` with ``x``.

    Args:
        alpha: Scalar forwarded to ``L2.MultiplyRows`` for every row.
        A: Distributed matrix exposing ``rows``, whose elements have ``index``
            and ``vector`` attributes.
        x: Vector that is broadcast to the workers.
        beta: Scalar applied to ``y`` before the product is added.
        y: Input vector; it is rebound, never modified in place.
        jsc: Context used to create the broadcast variables.

    Returns:
        ``beta * y`` when ``alpha`` is 0.0, otherwise that vector plus the
        collected per-row results.
    """
    # First form y := beta * y (0.0 resets to zeros, 1.0 leaves y untouched).
    if beta == 0.0:
        y = Vectors.zeros(y.size)
    elif beta != 1.0:
        y = beta * y

    # With alpha == 0 nothing from A contributes, so no job is launched.
    if alpha == 0.0:
        return y

    x_bc = jsc.broadcast(x)
    alpha_bc = jsc.broadcast(alpha)

    def multiply_row(row):
        # Results are sorted by key below, so MultiplyRows is expected to
        # return (index, value) pairs -- TODO confirm against L2.MultiplyRows.
        return L2.MultiplyRows(row.index, alpha_bc.value, row.vector, x_bc.value)

    keyed_products = A.rows.map(multiply_row)
    # Order by key, drop the keys and bring the values back to the driver.
    ordered_values = keyed_products.sortByKey().values().collect()

    return y + DenseVector(ordered_values)
def exactSolverCMU(self, D1_Cprod, b0, ChainProduct2, tolerance):
    """Run the fixed-length iteration ``y <- y - ChainProduct2*y + D1_Cprod*b0``.

    The iteration starts from the zero vector and runs ``q - 1`` times, where
    ``q = -ceil(ln(tolerance))``. When ``q - 1`` is not positive the loop does
    not run and the zero vector is returned.

    Args:
        D1_Cprod: Matrix multiplied with ``b0`` once, before the loop.
        b0: Vector multiplied by ``D1_Cprod``.
        ChainProduct2: Matrix applied to ``y`` on every iteration; its
            ``numRows()`` fixes the length of ``y``.
        tolerance: Positive number that determines the iteration count.

    Returns:
        The vector ``y`` after the last iteration.

    Raises:
        ValueError: If ``tolerance`` is not positive (raised by ``math.log``).
    """
    q = -int(math.ceil(math.log(tolerance)))
    # Constant term of the update, computed once.
    Chi = self.matrixVectorMultiply(D1_Cprod, b0)
    n = int(ChainProduct2.numRows())
    y = Vectors.zeros(n)
    for _ in range(q - 1):
        temp = self.matrixVectorMultiply(ChainProduct2, y)
        y = y - temp + Chi
    # logging.warn is a deprecated alias; logging.warning emits the same record.
    logging.warning('ExactSover: Done returning')
    return y
def centroid(model, data, sc, vector_size):
    """Return the mean of ``model.transform(item)`` over every item in ``data``.

    Args:
        model: Object whose ``transform`` maps one item to a vector.
        data: Sized collection of items to transform.
        sc: Context whose ``parallelize`` distributes the transformed vectors.
        vector_size: Length of the zero vector returned for empty ``data``.

    Returns:
        The element-wise sum of the transformed vectors divided by their
        count, or a dense zero vector of ``vector_size`` when ``data`` is empty.
    """
    if len(data) == 0:
        print("All data points are not in vocab")
        print(vector_size)
        from pyspark.mllib.linalg import Vectors
        return Vectors.dense(Vectors.zeros(vector_size))

    transform = model.transform
    vectors = [transform(item) for item in data]
    # Sum the vectors with a distributed reduce, then divide by the count.
    total = sc.parallelize(vectors).reduce(lambda left, right: left + right)
    return total / len(vectors)
def __init__(self, args, sc):
    """Read the inputs, run one DGEMV and print the resulting vector.

    All work happens in the constructor. ``outputVectorPath`` is stored, but
    the result is only printed, not written.

    Args:
        args: Parsed options. The attributes ``partitions``, ``inputVector``,
            ``inputMatrix``, ``outputVector``, ``alpha`` and ``beta`` are read.
        sc: Context object stored as ``self.ctx`` and handed to the reader
            helpers and to ``L2.DGEMV``.
    """
    self.ctx = sc
    self.numPartitions = args.partitions
    self.inputVectorPath = args.inputVector
    self.inputMatrixPath = args.inputMatrix
    self.outputVectorPath = args.outputVector
    self.alpha = args.alpha
    self.beta = args.beta

    # Build the row RDD once; both original branches shared this pipeline.
    inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
        .map(lambda line: IndexedRow(line[0], line[1]))
    # A partition count of 0 leaves the partitioning produced by the reader.
    if self.numPartitions != 0:
        inputMatrixData = inputMatrixData.repartition(self.numPartitions)

    # Parenthesised single-argument print is valid on Python 2 and 3 and emits
    # the same text; count() runs a job over the rows.
    print("Number of rows in Matrix with type" + str(type(inputMatrixData)) + " is: " + str(inputMatrixData.count()))

    inputMatrix = IndexedRowMatrix(inputMatrixData)
    inputVector = readVector(self.inputVectorPath, self.ctx)
    print("Vector size is: " + str(inputVector.size))

    # y starts as zeros, so the call yields the matrix-vector term plus beta * 0.
    result = Vectors.zeros(inputVector.size)
    result = L2.DGEMV(self.alpha, inputMatrix, inputVector, self.beta, result, self.ctx)

    printVector(result)
def solve(self):
    """Run the conjugate-gradient iteration and return the solution vector.

    The residual starts as a copy of ``self.inputVector``, which assumes
    ``self.result`` holds the zero vector on entry (presumably set by the
    constructor -- confirm). Each iteration performs one distributed
    matrix-vector product through ``L2.DGEMV``.

    The loop stops once the residual norm is at most ``self.EPSILON`` or the
    iteration cap is hit. The cap is tested after the update with a zero-based
    counter, so up to ``self.numIterations + 1`` iterations can run.

    Returns:
        ``self.result`` after the last update. It is also printed, together
        with the elapsed wall-clock time and the iteration count.
    """
    # time.clock() was removed in Python 3.8 and on Unix measured CPU time of
    # this process only; default_timer is a portable wall-clock timer.
    from timeit import default_timer

    start = default_timer()

    # r = b - A*x with x = 0, and the first search direction p = r.
    r = np.copy(self.inputVector)
    p = np.copy(r)
    Ap = Vectors.zeros(self.inputMatrix.numRows())
    rsold = r.dot(r)
    k = 0

    # With a zero residual the first step would compute 0/0 and fill the
    # result with NaN, so skip the loop when the start already satisfies the
    # convergence test.
    stop = math.sqrt(rsold) <= self.EPSILON

    while not stop:
        # Ap = A * p
        Ap = L2.DGEMV(1.0, self.inputMatrix, p, 0.0, Ap, self.ctx)

        # alpha = rsold / (p' * Ap)
        alpha = rsold / p.dot(Ap)

        # x = x + alpha * p
        self.result = self.result + alpha * p

        # r = r - alpha * Ap
        r = r - alpha * Ap

        # rsnew = r' * r
        rsnew = r.dot(r)

        if math.sqrt(rsnew) <= self.EPSILON or k >= self.numIterations:
            stop = True

        # p = r + (rsnew / rsold) * p
        p = r + (rsnew / rsold) * p
        rsold = rsnew
        k += 1

    end = default_timer()

    # Parenthesised single-argument print is valid on Python 2 and 3.
    print("Total time in solve system is: " + str(end - start) + " and " + str(k) + " iterations.")
    printVector(self.result)
    return self.result
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel


def parsePoint(line):
    """Turn one comma-separated text line into a LabeledPoint.

    The first field becomes the label and the remaining fields form a dense
    feature vector. A field that is not a valid float raises ValueError.
    """
    fields = list(map(float, line.split(',')))
    return LabeledPoint(label=fields[0], features=Vectors.dense(fields[1:]))


# Local SparkContext with master "local[2]" and a StreamingContext whose
# batch duration argument is 10.
sc = SparkContext("local[2]", "Streaming online learning")
ssc = StreamingContext(sc, 10)

# Text lines are read from a socket on localhost:9999.
stream = ssc.socketTextStream("localhost", 9999)

numFeatures = 100
zeroVector = Vectors.zeros(numFeatures)

# Streaming linear regression, starting from all-zero weights.
model = StreamingLinearRegressionWithSGD(stepSize=0.01, numIterations=1)
model.setInitialWeights(Vectors.dense([0] * numFeatures))

# Parse every incoming line and register the stream for training.
labeledStream = stream.map(parsePoint)
model.trainOn(labeledStream)