Example #1
 def test_dot(self):
     from scipy.sparse import lil_matrix
     lil = lil_matrix((4, 1))
     lil[1, 0] = 1
     lil[3, 0] = 2
     dv = DenseVector(array([1., 2., 3., 4.]))
     self.assertEqual(10.0, dv.dot(lil))
Example #2
    def get_ratings(self, res_id, ratings, top):
        if res_id not in self.models:
            logger.info("Keys: " + str(self.models.keys()))
            logger.info("res_id: " + str(res_id))
            logger.info("res_id type: " + str(type(res_id)))
            logger.info("res_id not known")
            return []

        pf = self.models[res_id].productFeatures()

        # Collect the feature vectors of the products the user has rated.
        user_f = pf.filter(lambda x: x[0] in ratings).collect()
        if len(user_f) == 0:
            logger.info("No product matches")
            return []

        # Sum the rated products' feature vectors into a single profile vector.
        tmp = DenseVector(user_f[0][1])
        for i in xrange(1, len(user_f)):
            tmp = tmp + user_f[i][1]

        # Score every unrated product by its dot product with the profile vector
        # and keep the ids of the `top` highest-scoring products.
        estimate_score = pf.map(lambda x: (x[0], tmp.dot(DenseVector(x[1])))) \
            .filter(lambda x: x[0] not in ratings) \
            .takeOrdered(top, lambda (k, v): -v)
        estimate_pid = map(lambda x: x[0], estimate_score)

        return estimate_pid
Example #3
 def test_squared_distance(self):
     from scipy.sparse import lil_matrix
     lil = lil_matrix((4, 1))
     lil[1, 0] = 3
     lil[3, 0] = 2
     dv = DenseVector(array([1., 2., 3., 4.]))
     sv = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
     self.assertEqual(15.0, dv.squared_distance(lil))
     self.assertEqual(15.0, sv.squared_distance(lil))
Example #4
    def test_norms(self):
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)
Example #5
class VectorUDTTests(MLlibTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))

    def test_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        row_matrix = RowMatrix(df)
        self.assertEqual(row_matrix.numRows(), 1)
        self.assertEqual(row_matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(df.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException
        df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        matrix = IndexedRowMatrix(df)
        self.assertEqual(matrix.numRows(), 1)
        self.assertEqual(matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(df.drop("_1"))
Example #6
    def DGEMV(alpha, A, x, beta, y, jsc):

        # First form y := beta * y.
        if beta != 1.0:
            if beta == 0.0:
                y = Vectors.zeros(y.size)
            else:
                y = beta * y

        if alpha == 0.0:
            return y

        # Broadcast the vector and the scalar so each worker gets a read-only copy.
        broadcastVector = jsc.broadcast(x)
        broadcastAlpha = jsc.broadcast(alpha)

        # Multiply each distributed row by x, restore row order, and collect the entries.
        result = A.rows.map(lambda currentRow: L2.MultiplyRows(currentRow.index,
                                                                 broadcastAlpha.value,
                                                                 currentRow.vector,
                                                                 broadcastVector.value))\
            .sortByKey()\
            .values()\
            .collect()

        resultVector = DenseVector(result)

        y = y + resultVector

        return y
Example #7
 def test_list(self):
     l = [0, 1]
     for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l),
                      array.array('l', l), xrange(2), tuple(l)]:
         converted = TypeConverters.toList(lst_like)
         self.assertEqual(type(converted), list)
         self.assertListEqual(converted, l)
Example #8
def train_transform_func(vector):
    # Remap NaN values to 0 so they don't break the ARIMA fit
    new_vec = DenseVector([0 if math.isnan(x) else x for x in vector.toArray()])
    arimaModel = ARIMA.fit_model(1, 0, 0, new_vec)
    forecasted = arimaModel.forecast(new_vec, 5)  # forecast 5 days ahead
    print(type(forecasted))
    exit(0)  # debugging: stop after the first vector
Example #9
    def _transform(self, row):
        """Transforms the sparse vector to a dense vector while putting it in a new column."""
        sparse_vector = row[self.input_column]
        dense_vector = DenseVector(sparse_vector.toArray())
        new_row = new_dataframe_row(row, self.output_column, dense_vector)

        return new_row
Example #10
    def cat2Num(self, df, indices):
        # sbaronia - extract the categorical data and make a df out of it
        # so oneHotEncoding can be run on them
        protocol_ind0 = df.select(df.id, df.rawFeatures[indices[0]].alias("features0")).cache()
        protocol_ind1 = df.select(df.id, df.rawFeatures[indices[1]].alias("features1")).cache()

        ind0_enc = self.oneHotEncoding(protocol_ind0, "features0").cache()
        ind1_enc = self.oneHotEncoding(protocol_ind1, "features1").cache()

        # sbaronia - add the hot-encoded feature columns to the original df
        int_join_1 = df.join(ind0_enc, ind0_enc.id == df.id, 'inner').drop(ind0_enc.id).cache()
        int_join_2 = int_join_1.join(ind1_enc, int_join_1.id == ind1_enc.id, 'inner').drop(int_join_1.id).cache()

        # sbaronia - now create a new column "features" which holds the
        # converted vector form, and drop the remaining columns
        comb_udf = udf(replaceCat2Num, StringType())
        int_join_2 = int_join_2.select(int_join_2.id, int_join_2.rawFeatures,
                                       comb_udf(int_join_2.rawFeatures,
                                                int_join_2.num_features0,
                                                int_join_2.num_features1).alias("features")).cache()

        # sbaronia - convert the list of numerical features to a DenseVector
        # so it can be used in KMeans
        dense_udf = udf(lambda line: DenseVector.parse(line), VectorUDT())
        feat = int_join_2.select(int_join_2.id, int_join_2.rawFeatures, dense_udf(int_join_2.features).alias("features")).cache()

        return feat
Example #11
 def test_load_vectors(self):
     import shutil
     data = [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]
     temp_dir = tempfile.mkdtemp()
     load_vectors_path = os.path.join(temp_dir, "test_load_vectors")
     try:
         self.sc.parallelize(data).saveAsTextFile(load_vectors_path)
         ret_rdd = MLUtils.loadVectors(self.sc, load_vectors_path)
         ret = ret_rdd.collect()
         self.assertEqual(len(ret), 2)
         self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
         self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
     except:
         self.fail()
     finally:
         shutil.rmtree(load_vectors_path)
Example #12
def gradientSummand(weights, lp):
    """Calculates the gradient summand for a given weight and `LabeledPoint`.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray`, and the two can be used
        interchangeably within this function.  For example, they both implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`.  The gradient summand.
    """
    return (weights.dot(DenseVector(lp.features)) - lp.label) * DenseVector(
        lp.features)
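
A small worked example (not from the original source; the values are illustrative), assuming `DenseVector` and `LabeledPoint` from `pyspark.mllib` are in scope:

w = DenseVector([1.0, 1.0])
lp = LabeledPoint(2.0, [3.0, 1.0])
summand = gradientSummand(w, lp)  # (w . x - y) * x = (4.0 - 2.0) * [3, 1] = DenseVector([6.0, 2.0])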
Example #13
 def test_model_setters(self):
     data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertIsNotNone(model.setWithMean(True))
     self.assertIsNotNone(model.setWithStd(True))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]),
                      DenseVector([-1.0, -1.0, -1.0]))
Example #14
def cosineSimilarity(candidateTfIdf):
    # frequencyDenseVectors_0 is an RDD of DenseVectors defined outside this function.
    frequencyDenseVectors_1 = candidateTfIdf.map(
        lambda vector: DenseVector(vector.toArray()))
    y1 = frequencyDenseVectors_1.collect()
    re = frequencyDenseVectors_0.map(lambda x: x.dot(y1[0]) /
                                     (x.norm(2) * y1[0].norm(2)))
    return re
Example #15
 def mapFn(row):
     pvals = []
     for predictor in predictors:
         predictor_index = lookup[predictor]
         if isinstance(dm[predictor], list):
             try:
                 encoded_val = dm[predictor].index(row[predictor_index])
                 if setToFlag is None:
                     pvals.append(encoded_val)
                 else:
                     flags = [0.0] * len(dm[predictor])
                     flags[encoded_val] = setToFlag
                     pvals += flags
             except ValueError:
                 if setToFlag is None:
                     pvals.append(None)
                 else:
                     pvals += [0.0] * len(dm[predictor])
         else:
             pval = row[predictor_index]
             # if pval == None:
             #    pval_min = dm[predictor]["min"]
             #    pval_max = dm[predictor]["max"]
             #    pval=pval_min+(pval_max - pval_min)*0.5
             pvals.append(pval)
     dv = DenseVector(pvals)
     if target_index == -1:
         return (row, dv)
     tval = row[target_index]
     if isinstance(dm[target], list):  # target is categorical
         try:
             tval = dm[target].index(tval)
         except ValueError:
             tval = None
     return (row, LabeledPoint(tval, dv))
Example #16
def parseLine(line):
    # Get Values
    label = line.severity
    # print('label: ', label)
    features = DenseVector(line.result)
    # print(features)
    return LabeledPoint(get_label_point(label), features)
Example #17
 def test_model_transform(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Example #18
    def test_model_transform(self):
        weight = Vectors.dense([3, 2, 1])

        densevec = Vectors.dense([4, 5, 6])
        sparsevec = Vectors.sparse(3, [0], [1])
        eprod = ElementwiseProduct(weight)
        self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6]))
        self.assertEqual(eprod.transform(sparsevec), SparseVector(3, [0], [3]))
Example #19
 def _predict(self, iterator):
     model = deserialize_keras_model(self.model)
     for row in iterator:
         X = np.asarray([row[self.features_column]])
         Y = model.predict(X)
         v = DenseVector(Y[0])
         new_row = new_dataframe_row(row, self.output_column, v)
         yield new_row
Example #20
 def test_list_int(self):
     for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
                     SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0),
                     array.array('d', [1.0, 2.0])]:
         vs = VectorSlicer(indices=indices)
         self.assertListEqual(vs.getIndices(), [1, 2])
         self.assertTrue(all([type(v) == int for v in vs.getIndices()]))
     self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))
Example #21
def parse_imr_w2v_vector(v_str):
    """
    Creates a Spark DenseVector from a ';'-separated string line.
    :param v_str: string of the form 'label;v1;v2;...'
    :return: DenseVector of the float values
    """
    from pyspark.mllib.linalg import DenseVector
    num_vec = map(float, v_str.split(';')[1:])  # TODO:? discard labels
    return DenseVector(num_vec)
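
For example, given the ';'-separated format the function expects (first token a label, the rest floats; the sample string below is made up):

parse_imr_w2v_vector("word_1;0.1;0.2;0.3")  # DenseVector([0.1, 0.2, 0.3])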
Example #22
 def test_eq(self):
     v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
     v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
     v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
     v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
     dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
     sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
     self.assertEqual(v1, v2)
     self.assertEqual(v1, v3)
     self.assertFalse(v2 == v4)
     self.assertFalse(v1 == v5)
     self.assertFalse(v1 == v6)
     # this is done as Dense and Sparse matrices can be semantically
     # equal while still implementing a different __eq__ method
     self.assertEqual(dm1, sm1)
     self.assertEqual(sm1, dm1)
Example #23
 def test_parse_vector(self):
     a = DenseVector([3, 4, 6, 7])
     self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
     self.assertEqual(Vectors.parse(str(a)), a)
     a = SparseVector(4, [0, 2], [3, 4])
     self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
     self.assertEqual(Vectors.parse(str(a)), a)
     a = SparseVector(10, [0, 1], [4, 5])
     self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)
Example #24
    def test_norms(self):
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)
Example #25
 def test_squared_distance(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([4, 3, 2, 1])
     lst1 = [4, 3, 2, 1]
     arr = pyarray.array('d', [0, 2, 1, 3])
     narr = array([0, 2, 1, 3])
     self.assertEqual(15.0, _squared_distance(sv, dv))
     self.assertEqual(25.0, _squared_distance(sv, lst))
     self.assertEqual(20.0, _squared_distance(dv, lst))
     self.assertEqual(15.0, _squared_distance(dv, sv))
     self.assertEqual(25.0, _squared_distance(lst, sv))
     self.assertEqual(20.0, _squared_distance(lst, dv))
     self.assertEqual(0.0, _squared_distance(sv, sv))
     self.assertEqual(0.0, _squared_distance(dv, dv))
     self.assertEqual(0.0, _squared_distance(lst, lst))
     self.assertEqual(25.0, _squared_distance(sv, lst1))
     self.assertEqual(3.0, _squared_distance(sv, arr))
     self.assertEqual(3.0, _squared_distance(sv, narr))
Example #26
def computeCost(featuresAndPrediction, model):
    allClusterCenters = [DenseVector(c) for c in model.clusterCenters()]
    arrayCollection   = featuresAndPrediction.rdd.map(array)

    def error(point, predictedCluster):
        center = allClusterCenters[predictedCluster]
        z      = point - center
        return sqrt((z*z).sum())
    
    return arrayCollection.map(lambda row: error(row[0], row[1])).reduce(lambda x, y: x + y)
Example #27
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.], [1., 2., 3., 4.], [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
Example #28
def processDf(df):
    mlSourceDF = df
    mlSourceDF.printSchema()
    mlSourceDF = mlSourceDF.fillna(
        0, subset=[x for x in mlSourceDF.columns if 'Lag' in x])
    mlSourceDF = mlSourceDF.na.drop(
        subset=["ServerIP", "SessionStartHourTime"])
    columnsForIndex = [
        'dayofweek', 'ServerIP', 'year', 'month', 'weekofyear', 'dayofmonth',
        'hourofday', 'Holiday', 'BusinessHour', 'Morning'
    ]
    mlSourceDF = mlSourceDF.fillna(0, subset=columnsForIndex)
    scoreDF = mlSourceDF

    # indexing
    scoreDF = indexModel.transform(scoreDF)
    # encoding
    scoreDFCat = ohPipelineModel.transform(scoreDF)

    # feature scaling
    featuresForScale = [x for x in scoreDFCat.columns if 'Lag' in x]
    assembler = VectorAssembler(inputCols=featuresForScale,
                                outputCol="features")
    assembled = assembler.transform(scoreDFCat).select('key', 'features')
    scaledData = scaler.transform(assembled).select('key', 'scaledFeatures')

    def extract(row):
        return (row.key, ) + tuple(float(x) for x in row.scaledFeatures.values)

    from pyspark.sql.types import Row
    from pyspark.mllib.linalg import DenseVector
    rdd = scaledData.rdd.map(
        lambda x: Row(key=x[0], scaledFeatures=DenseVector(x[1].toArray())))
    scaledDf = rdd.map(extract).toDF(["key"])
    # rename columns
    oldColumns = scaledDf.columns
    scaledColumns = ['scaledKey']
    scaledColumns.extend(['scaled' + str(i) for i in featuresForScale])
    scaledOutcome = scaledDf.select([
        col(oldColumns[index]).alias(scaledColumns[index])
        for index in range(0, len(oldColumns))
    ])
    scaledOutcome.show(1)
    scaledOutcome.cache()
    noScaledMLSourceDF = scoreDFCat.select([
        column for column in scoreDFCat.columns
        if column not in featuresForScale
    ])
    noScaledMLSourceDF.cache()
    noScaledMLSourceDF.printSchema()
    scaledOutcome.printSchema()
    newDF = noScaledMLSourceDF.join(
        scaledOutcome, (noScaledMLSourceDF.key == scaledOutcome.scaledKey),
        'outer')
    return newDF
Example #29
def to_dense_vector(value, n_dim=2):
    """Builds a one-hot encoded vector.

    # Arguments
        value: int. Index of the single "hot" entry.
        n_dim: int. Dimension of the output vector.
    """
    vector = np.zeros(n_dim)
    vector[value] = 1.0

    return DenseVector(vector)
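
For instance (illustrative call, assuming the function above is in scope):

to_dense_vector(1, n_dim=3)  # DenseVector([0.0, 1.0, 0.0])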
Example #30
 def toVector(value):
     """
     Convert a value to a MLlib Vector, if possible.
     """
     if isinstance(value, Vector):
         return value
     elif TypeConverters._can_convert_to_list(value):
         value = TypeConverters.toList(value)
         if all(map(lambda v: TypeConverters._is_numeric(v), value)):
             return DenseVector(value)
     raise TypeError("Could not convert %s to vector" % value)
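
A minimal usage sketch; `TypeConverters` lives in `pyspark.ml.param`, and the inputs below are illustrative:

from pyspark.ml.param import TypeConverters

TypeConverters.toVector([1.0, 2.0])  # DenseVector([1.0, 2.0])
TypeConverters.toVector(["a", "b"])  # raises TypeError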
Example #31
    def _transform(self, iterator):
        rows = []
        try:
            for row in iterator:
                label = row[self.input_column]
                v = DenseVector(to_vector(label, self.output_dim).tolist())
                new_row = new_dataframe_row(row, self.output_column, v)
                rows.append(new_row)
        except TypeError:
            pass

        return iter(rows)
Example #32
def getLabeledPrediction(weights, observation):
    """Calculates predictions and returns a (label, prediction) tuple.

    Note:
        The labels should remain unchanged as we'll use this information to calculate prediction
        error later.

    Args:
        weights (np.ndarray): An array with one weight for each feature in `trainData`.
        observation (LabeledPoint): A `LabeledPoint` that contains the correct label and the
            features for the data point.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    label = observation.label
    features = DenseVector(observation.features)
    weights = DenseVector(weights)
    prediction = weights.dot(features)
    return (label, prediction)
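
A quick sanity check (made-up values), assuming numpy is imported as `np` and `LabeledPoint` comes from `pyspark.mllib.regression`:

lp = LabeledPoint(5.0, [1.0, 2.0])
w = np.array([1.0, 2.0])
getLabeledPrediction(w, lp)  # (5.0, 5.0), since w . x = 1*1 + 2*2 = 5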
Example #33
def linregGradientDescent(trainData, numIters):
    """Calculates the weights and error for a linear regression model trained with gradient descent.
    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray`, and the two can be used
        interchangeably within this function.  For example, they both implement the `dot` method.
    Args:
        trainData (RDD of LabeledPoint): The labeled data for use in training the model.
        numIters (int): The number of iterations of gradient descent to perform.
    Returns:
        (np.ndarray, np.ndarray): A tuple of (weights, training errors).  Weights will be the
            final weights (one weight per feature) for the model, and training errors will contain
            an error (RMSE) for each iteration of the algorithm.
    """
    # The length of the training data
    n = trainData.count()
    # The number of features in the training data
    d = len(trainData.take(1)[0].features)
    w = np.zeros(d)
    w = DenseVector(w)
    alpha = 1.0
    # We will compute and store the training error after each iteration
    errorTrain = np.zeros(numIters)
    for i in range(numIters):
        # Use getLabeledPrediction from (3b) with trainData to obtain an RDD of (label, prediction)
        # tuples.  Note that the weights all equal 0 for the first iteration, so the predictions will
        # have large errors to start.
        labelsAndPredsTrain = trainData.map(
            lambda lp: getLabeledPrediction(w, lp))
        errorTrain[i] = calcRMSE(labelsAndPredsTrain)

        # Calculate the `gradient`.  Make use of the `gradientSummand` function you wrote in (3a).
        # Note that `gradient` should be a `DenseVector` of length `d`.
        gradient = trainData.map(lambda lp: gradientSummand(w, lp)).reduce(
            lambda a, b: a + b)
        gradient = DenseVector(gradient)
        # Update the weights
        alpha_i = alpha / (n * np.sqrt(i + 1))
        w -= alpha_i * gradient
    return w, errorTrain
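
A hypothetical driver for the function above, assuming a SparkContext `sc` plus the `calcRMSE`, `getLabeledPrediction`, and `gradientSummand` helpers shown in the other examples:

data = [LabeledPoint(3.0, [1.0, 1.0]), LabeledPoint(5.0, [2.0, 1.0])]
weights, errorTrain = linregGradientDescent(sc.parallelize(data), 5)
print weights      # final DenseVector of fitted weights
print errorTrain   # RMSE after each of the 5 iterations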
Example #34
    def fonction(line):

        # Parsing csv line
        values = list(csv.reader(StringIO(line)))[0]
        if is_train:
            PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked = values
        else:
            PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked = values

        # Sex
        sex_code = 0 if Sex == "male" else 1

        # Age: fall back to the mean when the field is missing or malformed
        try:
            safe_age = float(Age)
        except ValueError:
            safe_age = mean_age

        # Fare: same fallback
        try:
            safe_fare = float(Fare)
        except ValueError:
            safe_fare = mean_fare

        # Embarked
        safe_embarked = Embarked if Embarked else most_frequent_embarked
        code_embarked = 0 if safe_embarked == 'S' else 1 if safe_embarked == 'C' else 2  # == 'Q'

        # Title
        title = Name.split(",")[1].split(".")[0]
        code_title = 0
        if title == "Mr":
            code_title = 1
        elif title == "Miss":
            code_title = 2
        elif title == "Mrs":
            code_title = 3
        elif title == "Master":
            code_title = 4

        # Pclass code
        code_pclass = int(Pclass) - 1

        # child
        child_flag = 1 if safe_age <= 6 else 0

        features_vector = DenseVector([
            sex_code, safe_age, code_pclass,
            int(Parch),
            int(SibSp), safe_fare, code_embarked, code_title, child_flag,
            int(Parch)
        ])

        if is_train:
            return LabeledPoint(float(Survived), features_vector)
        else:
            return features_vector
Example #35
    def _transform(self, row):
        """Appends the desired binary label column."""
        value = row[self.input_column]
        vector = np.zeros(2)
        # Check if the name matches.
        if value == self.label:
            vector[0] = 1.0
        else:
            vector[1] = 1.0
        # Convert to a Spark DenseVector
        vector = DenseVector(vector)

        return new_dataframe_row(row, self.output_column, vector)
Example #36
def gradientSummand(weights, lp):
    """Calculates the gradient summand for a given weight and `LabeledPoint`.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray`, and the two can be used
        interchangeably within this function.  For example, they both implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`.  The gradient summand.
    """
    return (lp.features.dot(weights) - lp.label) * lp.features
Example #37
def getLabeledPrediction(weights, observation):
    """Calculates predictions and returns a (label, prediction) tuple.

    Note:
        The labels should remain unchanged as we'll use this information to calculate prediction
        error later.

    Args:
        weights (np.ndarray): An array with one weight for each feature in `trainData`.
        observation (LabeledPoint): A `LabeledPoint` that contains the correct label and the
            features for the data point.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    return (observation.label, observation.features.dot(weights))
Example #38
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
Example #39
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     arr = pyarray.array('d', [0, 1, 2, 3])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
     self.assertEqual(7.0, sv.dot(arr))
Example #40
# #### Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed.  `DenseVector` objects can be used in the distributed setting by either passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs.  You'll learn more about RDDs in the Spark tutorial.
# #### For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.

# In[28]:

from pyspark.mllib.linalg import DenseVector
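
# As a quick illustration of the float64 coercion noted above (this snippet is
# not part of the original exercise):
intVector = DenseVector(np.array([1, 2, 3]))
print intVector  # prints [1.0,2.0,3.0] -- the integers were stored as np.float64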


# In[31]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print "\nnumpyVector:\n{0}".format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])  # <FILL IN>
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(DenseVector(numpyVector))  # <FILL IN>

print "myDenseVector:\n{0}".format(myDenseVector)
print "\ndenseDotProduct:\n{0}".format(denseDotProduct)


# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), "myDenseVector is not a DenseVector")
Test.assertTrue(np.allclose(myDenseVector, np.array([3.0, 4.0, 5.0])), "incorrect value for myDenseVector")
Test.assertTrue(np.allclose(denseDotProduct, 0.0), "incorrect value for denseDotProduct")

Example #41
# #### Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed.  `DenseVector` objects can be used in the distributed setting by either passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs.  You'll learn more about RDDs in the Spark tutorial.
# #### For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.

# In[22]:

from pyspark.mllib.linalg import DenseVector


# In[25]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(numpyVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)


# In[26]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')
Example #42
# In[97]:

from pyspark.mllib.linalg import DenseVector


# In[98]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0, 4.0, 5.0]))
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector(numpyVector).dot(myDenseVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)


# In[99]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
Example #43
expectedError = [79.72013547, 30.27835699,  9.27842641,  9.20967856,  9.19446483]
Test.assertTrue(np.allclose(exampleErrorTrain, expectedError),
                'value of exampleErrorTrain is incorrect')


# #### ** (3d) Train the model **
# #### Now let's train a linear regression model on all of our training data and evaluate its accuracy on the validation set.  Note that the test set will not be used here.  If we evaluated the model on the test set, we would bias our final results.
# #### We've already done much of the required work: we computed the number of features in Part (1b); we created the training and validation datasets and computed their sizes in Part (1e); and, we wrote a function to compute RMSE in Part (2b).

# In[44]:

# TODO: Replace <FILL IN> with appropriate code
numIters = 50
weightsLR0, errorTrainLR0 = linregGradientDescent(parsedTrainData, numIters)

labelsAndPreds = parsedValData.map(lambda lp: (lp.label, weightsLR0.dot(lp.features)))
rmseValLR0 = calcRMSE(labelsAndPreds)

print 'Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(rmseValBase,
                                                                       rmseValLR0)


# In[45]:

# TEST Train the model (3d)
expectedOutput = [22.64535883, 20.064699, -0.05341901, 8.2931319, 5.79155768, -4.51008084,
                  15.23075467, 3.8465554, 9.91992022, 5.97465933, 11.36849033, 3.86452361]
Test.assertTrue(np.allclose(weightsLR0, expectedOutput), 'incorrect value for weightsLR0')


# #### ** Visualization 4: Training error **
Example #44
# In[29]:

from pyspark.mllib.linalg import DenseVector


# In[31]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0, 4.0, 5.0]))
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(numpyVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)


# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
Example #45
zeros = np.zeros(8)  # an array of 8 zeros: [ 0.  0.  0.  0.  0.  0.  0.  0.]
ones = np.ones(8)  # an array of 8 ones: [ 1.  1.  1.  1.  1.  1.  1.  1.]
print 'zeros:\n{0}'.format(zeros)
print '\nones:\n{0}'.format(ones)

zerosThenOnes = np.hstack((zeros, ones))  # note the double parentheses
# hstack returns [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.]
zerosAboveOnes = np.vstack((zeros, ones))  # a 2 by 8 array
# vstack returns [[ 0.  0.  0.  0.  0.  0.  0.  0.]
#                 [ 1.  1.  1.  1.  1.  1.  1.  1.]]

print '\nzerosThenOnes:\n{0}'.format(zerosThenOnes)
print '\nzerosAboveOnes:\n{0}'.format(zerosAboveOnes)

# When using PySpark, we use a DenseVector instead of a NumPy vector. Example below:

from pyspark.mllib.linalg import DenseVector

numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(numpyVector)  # DenseVector.dot() computes the dot product

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)