Exemple #1
0
    def __init__(self, args, sc):
        """Load the matrix and vector inputs and initialise the solver state.

        Args:
            args: Parsed options. The attributes read here are
                ``partitions``, ``iterations``, ``inputVector``,
                ``inputMatrix`` and ``outputVector``.
            sc: Context object stored as ``self.ctx`` and passed to the
                ``readMatrixRowPerLine`` / ``readVector`` helpers.
        """
        # Tolerance constant; it is only stored here, not used in this method.
        self.EPSILON = 1.0e-5
        self.ctx = sc

        self.numPartitions = args.partitions
        self.numIterations = args.iterations
        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        # Each parsed line supplies the row index (element 0) and the row
        # contents (element 1) of one IndexedRow.
        indexedRows = readMatrixRowPerLine(self.inputMatrixPath, self.ctx).map(
            lambda entry: IndexedRow(entry[0], entry[1]))
        # A partition count of 0 keeps the partitioning the reader produced.
        if self.numPartitions != 0:
            indexedRows = indexedRows.repartition(self.numPartitions)
        self.inputMatrix = IndexedRowMatrix(indexedRows)

        self.inputVector = readVector(self.inputVectorPath, self.ctx)

        # An iteration count of 0 selects a default of twice the vector size.
        if self.numIterations == 0:
            self.numIterations = 2 * self.inputVector.size

        self.result = Vectors.zeros(self.inputVector.size)
Exemple #2
0
    def DGEMV(alpha, A, x, beta, y, jsc):
        """Return ``beta * y`` plus the alpha-scaled product of ``A`` and ``x``.

        The per-row work is delegated to ``L2.MultiplyRows``, which is called
        with the row index, ``alpha``, the row vector and ``x``. Its output is
        sorted by key and the values collected, so it is expected to yield
        ``(row index, value)`` pairs -- presumably ``alpha * dot(row, x)``;
        confirm against its definition.

        Args:
            alpha: Scalar handed to ``L2.MultiplyRows``; when it is 0.0 the
                matrix is not touched at all.
            A: Matrix object whose ``rows`` can be mapped over; each row
                exposes ``index`` and ``vector``.
            x: Vector multiplied against every row (broadcast through ``jsc``).
            beta: Scalar applied to ``y`` before the product is added.
            y: Vector to scale and accumulate into; must expose ``size``
                when ``beta`` is 0.0.
            jsc: Context object providing ``broadcast``.

        Returns:
            The scaled ``y`` when ``alpha`` is 0.0, otherwise the scaled ``y``
            plus a ``DenseVector`` built from the collected row results.
        """
        # First form y := beta * y.  The original attached the scaling branch
        # to the outer ``beta != 1.0`` test, so y was only multiplied when
        # beta was exactly 1 and never for any other non-zero beta.
        if beta == 0.0:
            y = Vectors.zeros(y.size)
        elif beta != 1.0:
            y = beta * y

        # Nothing to add when alpha is zero; skip the distributed job.
        if alpha == 0.0:
            return y

        broadcastVector = jsc.broadcast(x)
        broadcastAlpha = jsc.broadcast(alpha)

        # sortByKey restores row order before the values are collected.
        result = A.rows.map(lambda currentRow: L2.MultiplyRows(currentRow.index,
                                                                 broadcastAlpha.value,
                                                                 currentRow.vector,
                                                                 broadcastVector.value))\
            .sortByKey()\
            .values()\
            .collect()

        resultVector = DenseVector(result)

        return y + resultVector
Exemple #3
0
 def exactSolverCMU(self, D1_Cprod, b0, ChainProduct2, tolerance):
     """Iterate ``y <- y - ChainProduct2 * y + Chi`` from a zero start.

     ``Chi`` is ``D1_Cprod`` applied to ``b0`` via
     ``self.matrixVectorMultiply``. The loop runs ``q - 1`` times, where
     ``q = -ceil(ln(tolerance))``.

     Args:
         D1_Cprod: Matrix applied once to ``b0`` to form ``Chi``.
         b0: Vector multiplied by ``D1_Cprod``.
         ChainProduct2: Matrix applied to the iterate on every pass; its
             ``numRows()`` sets the length of the result.
         tolerance: Positive number controlling the iteration count;
             ``math.log`` raises ``ValueError`` for values <= 0. For
             tolerances that make ``q - 1 <= 0`` the loop does not run and
             a zero vector is returned.

     Returns:
         The iterate ``y`` after the final pass.
     """
     q = -int(math.ceil(math.log(tolerance)))
     Chi = self.matrixVectorMultiply(D1_Cprod, b0)
     n = int(ChainProduct2.numRows())
     y = Vectors.zeros(n)
     for _ in range(q - 1):
         temp = self.matrixVectorMultiply(ChainProduct2, y)
         y = y - temp + Chi
     # logging.warn is a deprecated alias that newer Python versions no
     # longer provide; logging.warning is the supported spelling.
     logging.warning('ExactSover: Done returning')
     return y
def centroid(model, data, sc, vector_size):
    """Average the vectors that ``model.transform`` produces for ``data``.

    Args:
        model: Object whose ``transform`` maps one item of ``data`` to a
            vector-like value supporting ``+`` and ``/``.
        data: Sized collection of items to transform.
        sc: Context providing ``parallelize``; the summation is run as a
            ``reduce`` over the parallelized vectors.
        vector_size: Length of the zero vector returned when ``data`` is
            empty.

    Returns:
        The sum of the transformed items divided by their count, or a dense
        zero vector of length ``vector_size`` when ``data`` is empty.
    """
    if len(data) != 0:
        vectors = [model.transform(item) for item in data]
        total = sc.parallelize(vectors).reduce(lambda left, right: left + right)
        return total / len(vectors)

    # Nothing to average: report it and fall back to an all-zero vector.
    print("All data points are not in vocab")
    print(vector_size)
    from pyspark.mllib.linalg import Vectors
    return Vectors.dense(Vectors.zeros(vector_size))
Exemple #5
0
    def __init__(self, args, sc):
        """Read the inputs, run one ``L2.DGEMV`` call and print the result.

        Args:
            args: Parsed options. The attributes read here are
                ``partitions``, ``inputVector``, ``inputMatrix``,
                ``outputVector``, ``alpha`` and ``beta``.
            sc: Context object stored as ``self.ctx`` and passed to the
                reader helpers and to ``L2.DGEMV``.

        The output path is stored, but the result is only printed through
        ``printVector``; nothing is written to ``outputVectorPath`` here.
        """
        self.ctx = sc

        self.numPartitions = args.partitions

        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        self.alpha = args.alpha
        self.beta = args.beta

        # Each parsed line supplies the row index (element 0) and the row
        # contents (element 1) of one IndexedRow.
        inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
            .map(lambda line: IndexedRow(line[0], line[1]))
        # A partition count of 0 keeps the partitioning the reader produced.
        if self.numPartitions != 0:
            inputMatrixData = inputMatrixData.repartition(self.numPartitions)

        # Single-argument print(...) emits the same text on Python 2 and 3;
        # the former print statement was a syntax error on Python 3.
        # Note that count() evaluates the row data before the multiply.
        print("Number of rows in Matrix with type" + str(type(inputMatrixData)) + " is: " + str(inputMatrixData.count()))

        inputMatrix = IndexedRowMatrix(inputMatrixData)

        inputVector = readVector(self.inputVectorPath, self.ctx)

        print("Vector size is: " + str(inputVector.size))

        # y starts as a zero vector of the same size as x.
        result = Vectors.zeros(inputVector.size)

        # Argument order is DGEMV(alpha, A, x, beta, y, jsc).
        result = L2.DGEMV(self.alpha, inputMatrix, inputVector, self.beta, result, self.ctx)

        printVector(result)
Exemple #6
0
    def solve(self):
        """Run conjugate-gradient iterations and return ``self.result``.

        Reads ``self.inputMatrix``, ``self.inputVector``, ``self.EPSILON``,
        ``self.numIterations`` and ``self.ctx``; the solution estimate is
        accumulated into ``self.result``. The initial residual is taken to
        be the right-hand side itself, which assumes ``self.result`` is the
        zero vector on entry -- confirm against the constructor.

        Iteration stops once the residual norm is at most ``self.EPSILON``
        or the iteration counter reaches ``self.numIterations`` (the counter
        is tested before it is incremented, as in the original).

        Returns:
            ``self.result`` after the final iteration. The elapsed time,
            the iteration count and the vector itself are also printed.
        """
        # time.clock() no longer exists on current Python 3; default_timer is
        # available on both Python 2 and 3.
        from timeit import default_timer

        start = default_timer()

        r = np.copy(self.inputVector)

        # Only the size of Ap matters here: DGEMV is called with beta = 0.
        Ap = Vectors.zeros(self.inputMatrix.numRows())

        # p = r
        p = np.copy(r)

        # rsold = r' * r
        rsold = r.dot(r)

        k = 0

        # A residual already within tolerance (e.g. an all-zero right-hand
        # side) needs no iterations; entering the loop would divide 0 by 0
        # and fill the result with NaN.
        stop = math.sqrt(rsold) <= self.EPSILON

        while not stop:

            # Ap = A * p
            Ap = L2.DGEMV(1.0, self.inputMatrix, p, 0.0, Ap, self.ctx)

            # alpha = rsold / (p' * Ap)
            alpha = rsold / p.dot(Ap)

            # x = x + alpha * p
            self.result = self.result + alpha*p

            # r = r - alpha * Ap
            r = r - alpha*Ap

            # rsnew = r' * r
            rsnew = r.dot(r)

            if (math.sqrt(rsnew) <= self.EPSILON) or (k >= self.numIterations):
                stop = True

            # p = r + (rsnew / rsold) * p
            p = r + (rsnew/rsold) * p

            rsold = rsnew

            k += 1

        # End of the conjugate-gradient loop.

        end = default_timer()

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Total time in solve system is: " + str(end - start) + " and " + str(k) + " iterations.")

        printVector(self.result)

        return self.result
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
# Local SparkContext on two worker threads; the StreamingContext built on it
# is given a batch interval of 10 (seconds).
sc = SparkContext("local[2]", "Streaming online learning")
ssc = StreamingContext(sc, 10)

# Text lines are read from a TCP socket on localhost:9999.
stream = ssc.socketTextStream("localhost", 9999)

numFeatures = 100

zeroVector = Vectors.zeros(numFeatures)

# Streaming linear regression trained with SGD, starting from all-zero
# weights of length numFeatures.
model = StreamingLinearRegressionWithSGD(stepSize=0.01, numIterations=1)
model.setInitialWeights(Vectors.dense([0] * numFeatures))


#labeledStream=stream.map(lambda line: line.split('\t')).map(lambda fields: LabeledPoint(float(fields[0]),fields[1].map(lambda line: line.split(',')).map(float())))

def parsePoint(line):
    """Convert one comma-separated text line into a ``LabeledPoint``.

    The first field becomes the label and the remaining fields become the
    dense feature vector. Every field is parsed with ``float``, which raises
    ``ValueError`` for a field that is not a number.
    """
    numbers = list(map(float, line.split(',')))
    label, featureValues = numbers[0], numbers[1:]
    return LabeledPoint(label=label, features=Vectors.dense(featureValues))

# Parse every incoming text line into a LabeledPoint.
labeledStream = stream.map(parsePoint)

# Register the parsed stream as the model's training input.
model.trainOn(labeledStream)