Example #1
    def test_sparse_vector_indexing(self):
        sv = SparseVector(5, {1: 1, 3: 2})
        self.assertEqual(sv[0], 0.)
        self.assertEqual(sv[3], 2.)
        self.assertEqual(sv[1], 1.)
        self.assertEqual(sv[2], 0.)
        self.assertEqual(sv[4], 0.)
        self.assertEqual(sv[-1], 0.)
        self.assertEqual(sv[-2], 2.)
        self.assertEqual(sv[-3], 0.)
        self.assertEqual(sv[-5], 0.)
        for ind in [5, -6]:
            self.assertRaises(IndexError, sv.__getitem__, ind)
        for ind in [7.8, '1']:
            self.assertRaises(TypeError, sv.__getitem__, ind)

        zeros = SparseVector(4, {})
        self.assertEqual(zeros[0], 0.0)
        self.assertEqual(zeros[3], 0.0)
        for ind in [4, -5]:
            self.assertRaises(IndexError, zeros.__getitem__, ind)

        empty = SparseVector(0, {})
        for ind in [-1, 0, 1]:
            self.assertRaises(IndexError, empty.__getitem__, ind)
Example #2
class VectorUDTTests(MLlibTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([
            Row(label=1.0, features=self.dv1),
            Row(label=0.0, features=self.sv1)
        ])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" %
                                (v, type(v)))
Example #3
    def test_count_vectorizer_with_binary(self):
        dataset = self.spark.createDataFrame(
            [
                (
                    0,
                    "a a a b b c".split(" "),
                    SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),
                ),
                (
                    1,
                    "a a".split(" "),
                    SparseVector(3, {0: 1.0}),
                ),
                (
                    2,
                    "a b".split(" "),
                    SparseVector(3, {0: 1.0, 1: 1.0}),
                ),
                (
                    3,
                    "c".split(" "),
                    SparseVector(3, {2: 1.0}),
                ),
            ],
            ["id", "words", "expected"],
        )
        cv = CountVectorizer(binary=True, inputCol="words", outputCol="features")
        model = cv.fit(dataset)

        transformedList = model.transform(dataset).select("features", "expected").collect()

        for r in transformedList:
            feature, expected = r
            self.assertEqual(feature, expected)
    def test_calculate_cosine_similarities(self):
        """Test calculating similarity and return values are valid."""
        inputs = [
            ((10, 11), (SparseVector(5, {
                1: 2,
                2: 3,
                4: 2
            }), SparseVector(5, {
                2: 2,
                4: 3
            }))),
            ((10, 12), (SparseVector(5, {
                1: 2,
                2: 3,
                4: 2
            }), SparseVector(5, {
                1: 5,
                2: 3
            }))),
        ]

        data = sc.parallelize(inputs)

        result = calc_similarity(data).collect()

        self.assertEqual(len(result), len(inputs))
        # key is pair of ids
        self.assertEqual(len(result[0]), 2)
        # value contains cosine, 2 jaccard similarities
        self.assertEqual(len(result[0][1]), 3)

        # TODO: add Pearson
        self.assertTrue(
            all(isinstance(score, Decimal) for score in result[0][1]))
        self.assertTrue(all(score >= 0.0 for score in result[0][1]))
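For reference, the cosine part of the similarity that calc_similarity (not shown here) is expected to return can be computed directly from the two SparseVectors. A minimal sketch, assuming the pyspark.ml.linalg import (the mllib variant has the same dot/norm API):

from pyspark.ml.linalg import SparseVector

def cosine(v1: SparseVector, v2: SparseVector) -> float:
    # dot product divided by the product of the L2 norms
    denom = v1.norm(2) * v2.norm(2)
    return float(v1.dot(v2)) / denom if denom else 0.0

# for the (10, 11) pair above:
# cosine(SparseVector(5, {1: 2, 2: 3, 4: 2}), SparseVector(5, {2: 2, 4: 3}))
# == (3*2 + 2*3) / (sqrt(17) * sqrt(13)) ≈ 0.807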
Example #5
    def test_get_col_info_error_bad_shape(self):
        with spark_session('test_get_col_info_error_bad_shape') as spark:
            data_bad_shape = [[SparseVector(2, {0: 1.0})],
                              [SparseVector(1, {0: 1.0})]]
            schema = StructType([StructField('data', VectorUDT())])
            df = create_test_data_from_schema(spark, data_bad_shape, schema)

            with pytest.raises(ValueError):
                util._get_col_info(df)
Example #6
 def test_serialize(self):
     self._test_serialize(DenseVector(range(10)))
     self._test_serialize(DenseVector(array([1.0, 2.0, 3.0, 4.0])))
     self._test_serialize(DenseVector(pyarray.array("d", range(10))))
     self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
     self._test_serialize(SparseVector(3, {}))
     self._test_serialize(DenseMatrix(2, 3, range(6)))
     sm1 = SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
     self._test_serialize(sm1)
Example #7
    def test_get_metadata(self):
        expected_metadata = \
            {
                'float': {
                    'spark_data_type': FloatType,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.NOCHANGE,
                    'max_size': 1,
                    'shape': 1
                },
                'dense': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
                'sparse': {
                    'spark_data_type': SparseVector,
                    'is_sparse_vector_only': True,
                    'intermediate_format': constants.CUSTOM_SPARSE,
                    'max_size': 1,
                    'shape': 2
                },
                'mixed': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
            }

        with spark_session('test_get_metadata') as spark:
            data = [
                [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {0: 1.0}), DenseVector([1.0, 1.0])],
                [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}), SparseVector(2, {1: 1.0})],
            ]
            schema = StructType([
                StructField('float', FloatType()),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])
            df = create_test_data_from_schema(spark, data, schema)

            metadata = util._get_metadata(df)
            self.assertDictEqual(metadata, expected_metadata)
Example #8
 def test_hash(self):
     v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
     v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v4 = SparseVector(4, [(1, 1.0), (3, 2.5)])
     self.assertEqual(hash(v1), hash(v2))
     self.assertEqual(hash(v1), hash(v3))
     self.assertEqual(hash(v2), hash(v3))
     self.assertFalse(hash(v1) == hash(v4))
     self.assertFalse(hash(v2) == hash(v4))
Example #9
 def test_one_hot_encoder():
     actual_df = fe.one_hot_encoder(source_df, input_cols=['id'])
     expected_df = op.create.df([
         ('id', LongType(), True), ('x', LongType(), True),
         ('y', LongType(), True), ('features', VectorUDT(), True),
         ('id***ONE_HOT_ENCODER', VectorUDT(), True)
     ], [(0, 1, 2, DenseVector([1.0, 0.5, -1.0]), SparseVector(2,
                                                               {0: 1.0})),
         (1, 2, 3, DenseVector([2.0, 1.0, 1.0]), SparseVector(2, {1: 1.0})),
         (2, 3, 4, DenseVector([4.0, 10.0, 2.0]), SparseVector(2, {}))])
     assert (expected_df.collect() == actual_df.collect())
Example #10
    def test_norms(self):
        a = DenseVector([0, 2, 3, -1])
        self.assertAlmostEqual(a.norm(2), 3.742, 3)
        self.assertEqual(a.norm(1), 6)
        self.assertEqual(a.norm(inf), 3)
        a = SparseVector(4, [0, 2], [3, -4])
        self.assertAlmostEqual(a.norm(2), 5)
        self.assertEqual(a.norm(1), 7)
        self.assertEqual(a.norm(inf), 4)

        tmp = SparseVector(4, [0, 2], [3, 0])
        self.assertEqual(tmp.numNonzeros(), 1)
Example #11
 def test_eq(self):
     v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
     v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
     v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
     v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
     self.assertEqual(v1, v2)
     self.assertEqual(v1, v3)
     self.assertFalse(v2 == v4)
     self.assertFalse(v1 == v5)
     self.assertFalse(v1 == v6)
Example #13
class VectorUDTTests(MLlibTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([
            Row(label=1.0, features=self.dv1),
            Row(label=0.0, features=self.sv1)
        ])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" %
                                (v, type(v)))

    def test_unwrap_udt(self):
        df = self.spark.createDataFrame(
            [(Vectors.dense(1.0, 2.0, 3.0),),
             (Vectors.sparse(3, {1: 1.0, 2: 5.5}),)],
            ["vec"],
        )
        results = df.select(unwrap_udt("vec").alias("v2")).collect()
        unwrapped_vec = Row("type", "size", "indices", "values")
        expected = [
            Row(v2=unwrapped_vec(1, None, None, [1.0, 2.0, 3.0])),
            Row(v2=unwrapped_vec(0, 3, [1, 2], [1.0, 5.5])),
        ]
        self.assertEqual(results, expected)
 def project(*values):
     if not values:
         return SparseVector(size, {})
     hashVector = defaultdict(float)
     for d in values:
         if not d:
             continue
         for h, v in d.items():
             if not v:
                 continue
             k = abs(h) & (size - 1)
             s = (h >= 0) * 2 - 1
             hashVector[k] += v * s
     return SparseVector(size, dict(hashVector))
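The projection above is signed feature hashing; `size` is a closure variable of the enclosing scope and must be a power of two for `abs(h) & (size - 1)` to act as a modulo. A standalone sketch of the same logic with `size` made an explicit argument (`project_standalone` is a hypothetical name used only for illustration):

from collections import defaultdict
from pyspark.mllib.linalg import SparseVector  # import location assumed

def project_standalone(size, *values):
    # same signed-hashing logic as above, with `size` passed in explicitly
    hashVector = defaultdict(float)
    for d in values:
        for h, v in (d or {}).items():
            if not v:
                continue
            k = abs(h) & (size - 1)   # bucket index; requires size to be a power of two
            s = (h >= 0) * 2 - 1      # +1 for non-negative hash keys, -1 otherwise
            hashVector[k] += v * s
    return SparseVector(size, dict(hashVector))

# e.g. project_standalone(8, {3: 1.0, -5: 2.0}, {11: 0.5})
# == SparseVector(8, {3: 1.5, 5: -2.0})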
Example #15
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.], [1., 2., 3., 4.], [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     arr = pyarray.array('d', [0, 1, 2, 3])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
     self.assertEqual(7.0, sv.dot(arr))
def jaccard_score(rates1: SparseVector, rates2: SparseVector) -> Decimal:
    """Compute Jaccard similarity coefficient
        (https://en.wikipedia.org/wiki/Jaccard_index)

    This is comparing rates by each users.

    This will consider the rate value as well s.t. even if bothe are rated,
    it is not regarded as the same if the rate values are not the same.
    """
    r1 = rates1.toArray()
    r2 = rates2.toArray()
    union = int(sum((r1 == r2) * (r1 > 0)))
    intersection = int(sum((r1 + r2) > 0))
    return Decimal(union) / intersection
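A quick worked example of this score on two small rating vectors (the vectors and the import location are illustrative assumptions):

from decimal import Decimal
from pyspark.ml.linalg import SparseVector

r1 = SparseVector(4, {0: 5.0, 1: 3.0, 3: 4.0})
r2 = SparseVector(4, {0: 5.0, 1: 4.0, 2: 2.0})
# equal non-zero ratings: index 0 only      -> intersection = 1
# indices rated by either user: 0, 1, 2, 3  -> union = 4
assert jaccard_score(r1, r2) == Decimal(1) / 4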
Example #17
    def test_get_col_info(self):
        with spark_session('test_get_col_info') as spark:
            data = [[
                0,
                0.0,
                None,
                [1, 1],
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                DenseVector([1.0, 1.0])
            ], [
                1,
                None,
                None,
                [1, 1],
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                SparseVector(2, {1: 1.0})
            ]]

            schema = StructType([
                StructField('int', IntegerType()),
                StructField('float', FloatType()),
                StructField('null', NullType()),
                StructField('array', ArrayType(IntegerType())),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])

            df = create_test_data_from_schema(spark, data, schema)
            all_col_types, col_shapes, col_max_sizes = util._get_col_info(df)

            expected = [
                ('int', {int}, 1, 1),
                ('float', {float, NullType}, 1, 1),
                ('null', {NullType}, 1, 1),
                ('array', {list}, 2, 2),
                ('dense', {DenseVector}, 2, 2),
                ('sparse', {SparseVector}, 2, 1),
                ('mixed', {DenseVector, SparseVector}, 2, 2)
            ]

            for expected_col_info in expected:
                col_name, col_types, col_shape, col_size = expected_col_info
                assert all_col_types[col_name] == col_types, col_name
                assert col_shapes[col_name] == col_shape, col_name
                assert col_max_sizes[col_name] == col_size, col_name
Example #18
 def test_list(self):
     l = [0, 1]
     for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l),
                      pyarray.array('l', l), xrange(2), tuple(l)]:
         converted = TypeConverters.toList(lst_like)
         self.assertEqual(type(converted), list)
         self.assertListEqual(converted, l)
Example #19
 def test_from_to_pandas(self):
     sparse_values = {0: 0.1, 1: 1.1}
     sparse_vector = SparseVector(len(sparse_values), sparse_values)
     pser = pd.Series([sparse_vector])
     psser = ps.Series([sparse_vector])
     self.assert_eq(pser, psser.to_pandas())
     self.assert_eq(ps.from_pandas(pser), psser)
Example #20
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     arr = pyarray.array('d', [0, 1, 2, 3])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
     self.assertEqual(7.0, sv.dot(arr))
Example #21
def cluster_sparse(cluster_list, cluster_percent, cluster_avg):
    # Pack two per-cluster statistics into a single length-56 SparseVector:
    # slots 0-27 hold the percentages, slots 28-55 hold the averages.
    shifted = list(map(lambda x: x + 28, cluster_list))
    index = cluster_list + shifted
    value = cluster_percent + cluster_avg
    # SparseVector expects indices in ascending order, so sort the (index, value) pairs
    pairs = sorted(zip(index, value), key=lambda x: x[0])
    cols = [list(col) for col in zip(*pairs)]
    return SparseVector(56, cols[0], cols[1])
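A small usage sketch (the 28-cluster layout is taken from the code above; the values are illustrative):

cluster_list = [0, 3, 7]                 # clusters that were observed
cluster_percent = [0.5, 0.3, 0.2]        # goes into slots 0-27
cluster_avg = [1.2, 0.8, 2.4]            # goes into slots 28-55
sv = cluster_sparse(cluster_list, cluster_percent, cluster_avg)
# sv == SparseVector(56, {0: 0.5, 3: 0.3, 7: 0.2, 28: 1.2, 31: 0.8, 35: 2.4})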
Example #22
def train4(trainData):
    # assumes: from sklearn.svm import SVC; import scipy.sparse as sp; import joblib
    svc = SVC()
    y = []

    row_n = []
    col_n = []
    data_n = []
    row_index = 0
    feature_size = 0
    for sample in trainData:
        feature = sample["features"]
        feature_size = len(feature)
        # note: enumerate() keeps zero entries too, so every feature index is stored
        pairs = [(j, a) for j, a in enumerate(feature)]
        sv = SparseVector(feature_size, pairs)
        for c, v in zip(sv.indices, sv.values):
            row_n.append(row_index)
            col_n.append(c)
            data_n.append(v)

        y.append(sample["label"])

        row_index += 1
    X = sp.csc_matrix((data_n, (row_n, col_n)), shape=(row_index, feature_size))
    model = svc.fit(X, y)
    joblib.dump(model, '/tmp/sk_example.pkl')
    def transform(self, X_rdd, y_rdd=None):
        '''
        Given an X RDD (and optionally a y RDD), output a DataFrame with a
        term-frequency feature vector and, if available, labels.
        '''
        # check input type
        if type(X_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        if y_rdd and type(y_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")

        # get term frequencies
        X = X_rdd.map(self._term_frequency).cache()

        # convert to sparse: keep only the non-zero term counts
        X = X.map(lambda hf: (hf[0], SparseVector(self.num_features,
                                                  np.nonzero(hf[1])[0],
                                                  hf[1][hf[1] > 0])))

        # check if labels exist
        if y_rdd:
            # combine X and y into a single dataframe, joined on element index
            X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
            y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
            data = X.join(y).map(lambda r: (r[1][0][0], r[1][0][1], r[1][1]))
            schema = StructType([StructField('hash', StringType(), True),
                                 StructField('features', VectorUDT(), True),
                                 StructField('label', StringType(), True)])
            data = data.toDF(schema)
            data = data.withColumn('label', data.label.cast(DoubleType()))

        else:
            schema = StructType([StructField('hash', StringType(), True),
                                 StructField('features', VectorUDT(), True)])
            data = X.toDF(schema)

        return data
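The dense-to-sparse conversion above relies on numpy boolean masking. A standalone sketch of just that step (the vector below is illustrative, and the import location is assumed; both pyspark.mllib.linalg and pyspark.ml.linalg expose SparseVector):

import numpy as np
from pyspark.mllib.linalg import SparseVector

num_features = 8
counts = np.array([0., 2., 0., 0., 1., 0., 3., 0.])  # dense term-frequency vector
sv = SparseVector(num_features, np.nonzero(counts)[0], counts[counts > 0])
# sv == SparseVector(8, {1: 2.0, 4: 1.0, 6: 3.0})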
Example #24
 def udt_pdf(self):
     sparse_values = {0: 0.2, 1: 1.0}
     psers = {
         "this": self.pser,
         "that": pd.Series([SparseVector(len(sparse_values), sparse_values)]),
     }
     return pd.concat(psers, axis=1)
Example #25
def test(allHex,hashFiles,sc,sqlc,path,featureFitModel):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")
    def fun(accum,x):

        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [str(int(word,16)) for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))
    Vec= bytesRdd.map(lambda x: (x[0],createVector(x[1])))
    sparseVec = Vec.map(lambda x: (x[0],SparseVector(256,numpy.nonzero(x[1])[0],x[1][x[1]>0])))

    ngramFrame = sqlc.createDataFrame(sparseVec,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'], docFeatures=x['1grams'] + x['2grams'])).toDF()

    featuresCV = featureFitModel.transform(ngramFrame)

    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(ngramFrame,path)
    testData.show()
Example #26
    def lda_train(self, file):
        json_rdd, count = self.load_train_titleFeature_rdd(file)
        vocabulary_set = json_rdd.map(lambda line : get_title_words(line))\
                                 .flatMap(lambda word : word).distinct().collect()

        vocab_size = self.sc.broadcast(max(vocabulary_set) + 1)

        print('vocabulary size: ' + str(vocab_size.value))

        sparseVec_rdd = json_rdd.map(lambda line : cast_dict_str2int(line.get('title_features')))\
                                .map(lambda value : SparseVector(vocab_size.value, value))
        zip_rdd = sparseVec_rdd.zipWithIndex()
        lda_train_rdd = zip_rdd.map(lambda x: [x[1], x[0]]).cache()

        K = 4
        max_iter = 10
        seed = 1024

        lda_train_df = self.sqlContext.createDataFrame(lda_train_rdd.collect(),
                                                       ["id", "features"])
        lda = LDA(k=K, maxIter=max_iter, seed=seed)
        lda_model = lda.fit(lda_train_df)

        print('LDA model vocabSize : ' + str(lda_model.vocabSize()))
        print(lda_model.isDistributed())
        lda_model.describeTopics().show()

        #os.system("hadoop fs -rmr {}".format(self.lda_model_path))
        #os.system("hadoop fs -rmr {}".format(self.lda_path))

        lda_model.write().overwrite().save(self.lda_model_path)

        self.sc.stop()
Example #27
    def test_squared_distance(self):
        def squared_distance(a, b):
            if isinstance(a, Vector):
                return a.squared_distance(b)
            else:
                return b.squared_distance(a)

        sv = SparseVector(4, {1: 1, 3: 2})
        dv = DenseVector(array([1., 2., 3., 4.]))
        lst = DenseVector([4, 3, 2, 1])
        lst1 = [4, 3, 2, 1]
        arr = pyarray.array('d', [0, 2, 1, 3])
        narr = array([0, 2, 1, 3])
        self.assertEqual(15.0, squared_distance(sv, dv))
        self.assertEqual(25.0, squared_distance(sv, lst))
        self.assertEqual(20.0, squared_distance(dv, lst))
        self.assertEqual(15.0, squared_distance(dv, sv))
        self.assertEqual(25.0, squared_distance(lst, sv))
        self.assertEqual(20.0, squared_distance(lst, dv))
        self.assertEqual(0.0, squared_distance(sv, sv))
        self.assertEqual(0.0, squared_distance(dv, dv))
        self.assertEqual(0.0, squared_distance(lst, lst))
        self.assertEqual(25.0, squared_distance(sv, lst1))
        self.assertEqual(3.0, squared_distance(sv, arr))
        self.assertEqual(3.0, squared_distance(sv, narr))
 def test_model_logistic_regression_binary_class(self):
     import inspect
     import os
     this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
     input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
     original_data = self.spark.read.format("libsvm").load(input_path)
     #
     # truncate the features
     #
     self.spark.udf.register("truncateFeatures", lambda x: SparseVector(5, range(0,5), x.toArray()[125:130]),
                             VectorUDT())
     data = original_data.selectExpr("label", "truncateFeatures(features) as features")
     lr = LogisticRegression(maxIter=100, tol=0.0001)
     model = lr.fit(data)
     # the name of the input for Logistic Regression is 'features'
     model_onnx = convert_sparkml(model, 'sparkml logistic regression', [('features', FloatTensorType([1, model.numFeatures]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     # run the model
     import pandas
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     ]
     dump_data_and_sparkml_model(data_np, expected, model, model_onnx,
                                 basename="SparkmlLogisticRegression")
Example #29
    def test_count_vectorizer_from_vocab(self):
        model = CountVectorizerModel.from_vocabulary(
            ["a", "b", "c"], inputCol="words", outputCol="features", minTF=2
        )
        self.assertEqual(model.vocabulary, ["a", "b", "c"])
        self.assertEqual(model.getMinTF(), 2)

        dataset = self.spark.createDataFrame(
            [
                (
                    0,
                    "a a a b b c".split(" "),
                    SparseVector(3, {0: 3.0, 1: 2.0}),
                ),
                (
                    1,
                    "a a".split(" "),
                    SparseVector(3, {0: 2.0}),
                ),
                (
                    2,
                    "a b".split(" "),
                    SparseVector(3, {}),
                ),
            ],
            ["id", "words", "expected"],
        )

        transformed_list = model.transform(dataset).select("features", "expected").collect()

        for r in transformed_list:
            feature, expected = r
            self.assertEqual(feature, expected)

        # Test an empty vocabulary
        with QuietTest(self.sc):
            with self.assertRaisesRegex(Exception, "vocabSize.*invalid.*0"):
                CountVectorizerModel.from_vocabulary([], inputCol="words")

        # Test model with default settings can transform
        model_default = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words")
        transformed_list = (
            model_default.transform(dataset)
            .select(model_default.getOrDefault(model_default.outputCol))
            .collect()
        )
        self.assertEqual(len(transformed_list), 3)
Example #30
    def test_random_forrest_regression(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        feature_count = 5
        self.spark.udf.register(
            "truncateFeatures",
            lambda x: SparseVector(feature_count, range(0, feature_count),
                                   x.toArray()[125:130]), VectorUDT())
        data = original_data.selectExpr(
            "cast(label as string) as label",
            "truncateFeatures(features) as features")
        label_indexer = StringIndexer(inputCol="label",
                                      outputCol="indexedLabel")
        feature_indexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures",
                                        maxCategories=10,
                                        handleInvalid='error')

        rf = RandomForestRegressor(labelCol="indexedLabel",
                                   featuresCol="indexedFeatures",
                                   numTrees=10)
        pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
        model = pipeline.fit(data)
        model_onnx = convert_sparkml(
            model,
            'Sparkml RandomForest Regressor',
            [('label', StringTensorType([1, 1])),
             ('features', FloatTensorType([1, feature_count]))],
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data.limit(1))
        data_np = {
            'label':
            data.limit(1).toPandas().label.values,
            'features':
            data.limit(1).toPandas().features.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        }
        expected = [
            predicted.toPandas().indexedLabel.values.astype(numpy.int64),
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlRandomForestRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                               data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #31
 def test_list_int(self):
     for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
                     SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0),
                     pyarray.array('d', [1.0, 2.0])]:
         vs = VectorSlicer(indices=indices)
         self.assertListEqual(vs.getIndices(), [1, 2])
         self.assertTrue(all([type(v) == int for v in vs.getIndices()]))
     self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))
def jaccard_score_binary(rates1: SparseVector,
                         rates2: SparseVector) -> Decimal:
    """Compute Jaccard similarity coefficient
        (https://en.wikipedia.org/wiki/Jaccard_index)

    This is comparing rates by each users.

    This will ignore the actual rate, and this assumes people who watch
    the same movies have similar preference, therefore those
    movies are similar.
    """
    # is there efficient way to handle sparse vector?
    r1 = rates1.toArray()
    r2 = rates2.toArray()
    union = int(sum(((r1 > 0) == (r2 > 0)) * (r1 > 0)))
    intersection = int(sum((r1 + r2) > 0))
    return Decimal(union) / intersection
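And a quick worked example of the binary variant (again, the vectors and the import location are illustrative):

from decimal import Decimal
from pyspark.ml.linalg import SparseVector

r1 = SparseVector(4, {0: 5.0, 1: 3.0, 3: 4.0})
r2 = SparseVector(4, {0: 1.0, 1: 4.0, 2: 2.0})
# both users rated indices 0 and 1        -> intersection = 2
# either user rated indices 0, 1, 2, 3    -> union = 4
assert jaccard_score_binary(r1, r2) == Decimal(2) / 4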