Example #1
def read_csv(path):
    df = spark.read.csv(path, header=True, inferSchema=True)

    udf = UserDefinedFunction(lambda x: Vectors.parse(x), VectorUDT())
    new_df = df.withColumn('features', udf(df.features))

    return new_df
Example #2
    def transform(self, X_rdd, y_rdd=None):
        '''
        Given an X RDD (and optionally a y RDD), output a DataFrame with the term-frequency feature vector and labels.
        '''
        #check input type
        if type(X_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        if y_rdd and type(y_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")

        #convert X to URL paths
        X = X_rdd.map(
            lambda x:
            'https://s3.amazonaws.com/eds-uga-csci8360/data/project2/binaries/'
            + x + '.bytes')
        X = X.map(self._term_frequency)

        #check if labels exist
        if y_rdd:
            #combine X and y into single dataframe
            X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
            y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
            data = X.join(y).map(lambda r: r[1])
            data = data.toDF(['features', 'label'])

        else:
            X = X.map(lambda row: [row])
            schema = StructType([StructField("features", VectorUDT(), True)])
            data = X.toDF(schema)

        return data
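A hypothetical usage sketch for the transform method above (the wrapper class name TermFrequencyVectorizer, the SparkContext handle sc, and the placeholder document hashes are assumptions, not taken from the source):

# Hypothetical usage; class name and hash strings are placeholders.
X_rdd = sc.parallelize(["<doc-hash-1>", "<doc-hash-2>"])   # ids resolved to the S3 .bytes files
y_rdd = sc.parallelize([1.0, 3.0])                         # numeric class labels

tf = TermFrequencyVectorizer()
train_df = tf.transform(X_rdd, y_rdd)   # DataFrame with 'features' and 'label' columns
test_df = tf.transform(X_rdd)           # DataFrame with a 'features' column only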
Example #3
class VectorUDTTests(PySparkTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        sqlCtx = SQLContext(self.sc)
        rdd = self.sc.parallelize(
            [LabeledPoint(1.0, self.dv1),
             LabeledPoint(0.0, self.sv1)])
        srdd = sqlCtx.inferSchema(rdd)
        schema = srdd.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = srdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise ValueError("expecting a vector but got %r of type %r" %
                                 (v, type(v)))
Example #4
class VectorUDTTests(MLlibTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))

    def test_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException

        df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        row_matrix = RowMatrix(df)
        self.assertEqual(row_matrix.numRows(), 1)
        self.assertEqual(row_matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(df.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        from pyspark.sql.utils import IllegalArgumentException

        df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        matrix = IndexedRowMatrix(df)
        self.assertEqual(matrix.numRows(), 1)
        self.assertEqual(matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(df.drop("_1"))

    def test_row_matrix_invalid_type(self):
        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        invalid_type = ""
        matrix = RowMatrix(rows)
        self.assertRaises(TypeError, matrix.multiply, invalid_type)

        irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]), IndexedRow(1, [4, 5, 6])])
        imatrix = IndexedRowMatrix(irows)
        self.assertRaises(TypeError, imatrix.multiply, invalid_type)
Example #5
def read_csv(path):
    df = spark.read.csv(path, header=True, inferSchema=True)

    udf = UserDefinedFunction(lambda x: Vectors.parse(x), VectorUDT())
    # https://spark.apache.org/docs/latest/ml-migration-guides.html
    new_df = MLUtils.convertVectorColumnsToML(
        df.withColumn('features', udf(df.features)))

    return new_df
Example #6
    def _sax_transform(self, df):

        # normalize series
        normalize_udf = F.udf(lambda x: Vectors.dense((x.toArray() - np.mean(x.toArray())) / np.std(x.toArray())),
                              returnType=VectorUDT())
        df = df.withColumn("normalized_serie", normalize_udf(df[self._featuresCol]))

        # piecewise aggregate approximation (PAA)
        to_paa_udf = F.udf(lambda x: Vectors.dense(self._to_paa(x.toArray())),
                           returnType=VectorUDT())
        df = df.withColumn("paa_serie", to_paa_udf(df["normalized_serie"]))

        # discretization
        discretize_udf = F.udf(lambda x: Vectors.dense(self._discretize(x.toArray())),
                               returnType=VectorUDT())
        df = df.withColumn("discretized_serie", discretize_udf(df["paa_serie"]))

        return df
Example #7
def get_features(df):
    """
    Proj Denoise and feature extraction on X, Y and Z from the data frame we have
    after tasks_to_intervals
    
    """
    
    schema = StructType([
    StructField("proj_ver", VectorUDT(), False),
    StructField("proj_hor", VectorUDT(), False)
    ])

    proj_func = udf(proj_for_spark.project_gravity_xyz, schema)

    df = df['X', 'Y', 'Z', 'key'].withColumn('proj', proj_func("X", "Y", "Z"))
    df = df.select('key',
                 'proj.proj_ver', 
                 'proj.proj_hor')
    
    df = df['proj_ver','proj_hor', 'key'].withColumn('denoised_ver',
                    utils_function_spark.denoise_func("proj_ver")).withColumn('denoised_hor', 
                                utils_function_spark.denoise_func("proj_hor"))
    df = df.select('key', "denoised_ver", "denoised_hor") 
    
    df = df["denoised_ver", "denoised_hor", 'key'].withColumn('rel_features_ver', 
                        utils_function_spark.toDWT_relative_udf("denoised_ver")).\
                        withColumn('cont_features_ver',
                        utils_function_spark.toDWT_cont_udf("denoised_ver"))

    df = df["rel_features_ver", "cont_features_ver", "denoised_hor", 'key'].\
                       withColumn('rel_features_hor', 
                       utils_function_spark.toDWT_relative_udf("denoised_hor")).\
                       withColumn('cont_features_hor',
                       utils_function_spark.toDWT_cont_udf("denoised_hor"))


    df = df.select('key', 'rel_features_ver', 'cont_features_ver',
                                 'rel_features_hor', 'cont_features_hor')

    
    return df
Example #8
def data_frame_from_file(sqlContext, file_name, fraction):
    lines = sc.textFile(file_name).sample(False, fraction)
    parts = lines.map(lambda l: map(lambda s: int(s), l.split(",")))
    samples = parts.map(lambda p: (float(p[
        0]), DenseVector(map(lambda el: el / 255.0, p[1:]))))

    fields = [
        StructField("label", DoubleType(), True),
        StructField("features", VectorUDT(), True)
    ]
    schema = StructType(fields)

    data = sqlContext.createDataFrame(samples, schema)
    return data
Example #9
    def cat2Num(self, df, indices):
        """
            Write your code!
        """

        # select a single feature from a list of features
        def select_feature(raw_feature, index):
            return raw_feature[index]

        # remove the features at the given indices and return the rest as a dense vector
        def delete_feature(raw_feature, indices):
            feature = [
                i for j, i in enumerate(raw_feature) if j not in indices
            ]
            return Vectors.dense(feature)

        # Get categorical features and perform One-Hot Encoding
        df_prev = df
        for index in indices:
            select_feature_udf = udf(lambda x: select_feature(x, index),
                                     StringType())
            df_encoded = df_prev.withColumn("cat_" + str(index),
                                            select_feature_udf("rawFeatures"))
            # string index
            stringIndexer = StringIndexer(inputCol="cat_" + str(index),
                                          outputCol="cat_index_" + str(index))
            model_stringIndexer = stringIndexer.fit(df_encoded)
            indexed = model_stringIndexer.transform(df_encoded)

            # one-hot encode
            encoder = OneHotEncoder(inputCol="cat_index_" + str(index),
                                    outputCol="cat_vector_" + str(index),
                                    dropLast=False)
            encoded = encoder.transform(indexed)
            df_prev = encoded

        # Get continuous features by removing the categorical indices from rawFeatures
        delete_feature_udf = udf(lambda x: delete_feature(x, indices),
                                 VectorUDT())
        df_cont = df_prev.withColumn("cont", delete_feature_udf("rawFeatures"))

        # Combine the one-hot encoded categorical and continuous features
        feature = []
        for index in indices:
            feature.append("cat_vector_" + str(index))
        feature.append("cont")
        assembler = VectorAssembler(inputCols=feature, outputCol="features")
        df_transformed = assembler.transform(df_cont) \
            .select("id","rawFeatures","features")
        return df_transformed
Example #10
def pipe_scale_cols(df, with_mean=True, with_std=True, use_dense_vector=True):
    newdf = df
    if use_dense_vector:
        # v.toDense does not exist on PySpark vectors; build a DenseVector explicitly
        to_dense_udf = udf(lambda v: DenseVector(v.toArray()), VectorUDT())
        dense_df = newdf.withColumn("features-dense",
                                    to_dense_udf(newdf["features"]))
        newdf = dense_df.drop("features").withColumnRenamed(
            "features-dense", "features")

    scaler = StandardScaler(withMean=with_mean,
                            withStd=with_std,
                            inputCol="features",
                            outputCol="features-scaled")
    model = scaler.fit(newdf)
    newdf = model.transform(newdf)
    newdf = newdf.drop("features")
    newdf = newdf.withColumnRenamed("features-scaled", "features")
    return newdf
Example #11
    def transform(self, df):
        
        # dataframe columns
        cols = df.columns
        
        # make SAX transformation
        df = self._sax_transform(df)
        
        # calculate distance to centroids
        distance_to_centroids_udf = F.udf(lambda x: Vectors.dense(self._distance_to_centroids(x.toArray().astype(int))),
                                            returnType=VectorUDT())
        df = df.withColumn("dist_centroids",distance_to_centroids_udf(df["discretized_serie"]))

        # assignation
        min_distance_udf = F.udf(lambda x: int(np.argmin(x.toArray())),returnType=IntegerType())
        df = df.withColumn("assignation",min_distance_udf(df["dist_centroids"]))
        
        # return prediction dataframe
        return df.select(cols+["assignation"])
Example #12
    def _fit(self, df):
        
        self._centroid_init_function(df)
        
        # fit kmeans algorithm
        cost_values = [np.inf]
        for it in range(self._maxIter):
                                
            # calculate distance to centroids
            distance_to_centroids_udf = F.udf(lambda x: Vectors.dense(self._distance_to_centroids(x.toArray().astype(int))),
                                                returnType=VectorUDT())
            df = df.withColumn("dist_centroids",distance_to_centroids_udf(df["discretized_serie"]))

            # assignation
            min_distance_udf = F.udf(lambda x: int(np.argmin(x.toArray())),returnType=IntegerType())
            df = df.withColumn("assignation",min_distance_udf(df["dist_centroids"]))

            # recalculate centroids
            df_centroids = df.select(["assignation","paa_serie","dist_centroids"])
            centroids_samples = df_centroids.map(lambda x: (x[0],(x[1].toArray(),1,x[2].toArray()[x[0]])))
            centroids_sum = centroids_samples.reduceByKey(lambda x,y: (np.add(x[0],y[0]),x[1]+y[1],x[2]+y[2]))
            centroids_mean = centroids_sum.map(lambda x: (x[0],x[1][0]/float(x[1][1]))).collect()
            centroids_mean = sorted(centroids_mean)
            for i,centroid in centroids_mean:
                self.centroids_[i] = self._discretize(centroid)

            # calculate cost
            cost_sum = centroids_sum.map(lambda x: x[1][2]).reduce(add)
            
            # check for convergence
            if abs(cost_values[-1] - cost_sum) <= self._tol:
                # convergence reached
                cost_values.append(cost_sum)
                break
                
            cost_values.append(cost_sum)
            
        self.cost_ = cost_values[1:]
        return df
Example #13
    def _kpp_init(self, df):

        self.centroids_ = []
        new_centroid = df.select("discretized_serie").sample(False,0.5).first()[0]
        self.centroids_.append(new_centroid.toArray())

        sw = True
        while(sw):
            df_aux = df.select("discretized_serie")
            
            # calculate distance to centroids
            distance_to_centroids_udf = F.udf(lambda x: Vectors.dense(self._distance_to_centroids(x.toArray().astype(int))),
                                                returnType=VectorUDT())
            df_aux = df_aux.withColumn("dist_centroids",distance_to_centroids_udf(df_aux["discretized_serie"]))

            # calculate assignation
            min_distance_udf = F.udf(lambda x: int(np.argmin(x.toArray())),returnType=IntegerType())
            df_aux = df_aux.withColumn("assignation",min_distance_udf(df_aux["dist_centroids"]))

            # distance to nearest centroid
            nearest_centroid_udf = F.udf(lambda x: float(np.amin(x.toArray())),returnType=FloatType())
            df_aux = df_aux.withColumn("dist_nearest_centroid",nearest_centroid_udf(df_aux["dist_centroids"]))
            
            # order centroids by distance
            df_aux = df_aux.withColumn("dist_nearest_centroid_reversed",(-1)*df_aux["dist_nearest_centroid"])
            window = Window.partitionBy("assignation").orderBy("dist_nearest_centroid_reversed")
            df_aux = df_aux.select(df_aux["discretized_serie"],df_aux["dist_nearest_centroid_reversed"],F.row_number().over(window).alias("ordering"))
            df_aux = df_aux.where(df_aux["ordering"]==4)
            
            # get new centroids
            new_centroids = df_aux.select("discretized_serie").collect()
            for new_centroid in new_centroids:
                self.centroids_.append(new_centroid["discretized_serie"].toArray())
                if len(self.centroids_)>=self._k:
                    sw = False
                    break
                    
        self.centroids_ = [centroid.astype(int) for centroid in self.centroids_]
def vectorizeBi(row, dico):
    vector_dict = {}
    for w in row.bigrams:
        if w in dico:
            vector_dict[dico[w]] = 1
    return (row.label, SparseVector(len(dico), vector_dict))


# In[321]:

from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([
    StructField('label', DoubleType(), True),
    StructField('bigramVectors', VectorUDT(), True)
])

# In[322]:

from functools import partial
print "Converting bigrams to sparse vectors in a dataframe for the train set"
t0 = time()
features = dfTrain.map(partial(vectorizeBi,
                               dico=dict_broad.value)).toDF(schema)
features.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))

# In[323]:

def vectorize(row, dico):
    vector_dict = {}
    for w in row.words:
        if w in dico:
            vector_dict[dico[w]] = 1
    return (row.label, SparseVector(len(dico), vector_dict))


from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([
    StructField('label', DoubleType(), True),
    StructField('Vectors', VectorUDT(), True)
])

features = dfTrainTok.map(partial(vectorize,
                                  dico=dict_broad.value)).toDF(schema)

print "Features created"

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print "labels indexed"
def vectorize(row, dicoUni, dicoTri):
    # header reconstructed from the call sites below; the listing is truncated here
    vector_dict = {}
    length = len(dicoUni)
    for w in row.words:
        if w in dicoUni:
            vector_dict[dicoUni[w]]=1
    for tri in row.wordTrigrams:
        if tri in dicoTri:
            vector_dict[dicoTri[tri]+length]=1
    return (row.label,SparseVector(length+len(dicoTri),vector_dict))


# In[15]:

from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField,DoubleType,ArrayType,StringType
t = ArrayType(StringType())
schema = StructType([StructField('label', DoubleType(), True),
                     StructField('featureVectors', VectorUDT(), True)])


# In[16]:

print "Creating feature vectors"
t0 = time()
dfTrainVec=dfTrain.map(partial(vectorize,dicoUni=dict_broad.value,dicoTri=dictTri_broad.value)).toDF(schema)
dfTestVec=dfTest.map(partial(vectorize,dicoUni=dict_broad.value,dicoTri=dictTri_broad.value)).toDF(schema)
tt = time() - t0
print "Dataframe created in {} second".format(round(tt,3))


# In[19]:

print "Indexing labels"
Example #17
def project(comp):
    return udf(lambda s: Vectors.dense(np.dot(s, comp)), VectorUDT())
Example #18
df_flat = df_test3.rdd.map(lambda raw:  ((raw[0], raw[1], raw[2], raw[3]) , 
                                         list(zip(raw[4], raw[5], raw[6])))).\
                          flatMapValues(lambda raw :raw)

df_flat = df_flat.map(lambda raw: (raw[0],raw[1][0],raw[1][1],raw[1][2])).\
                     toDF(['key', 'X', 'Y', 'Z'])

   




########################################################################################
schema = StructType([
    StructField("proj_ver", VectorUDT(), False),
    StructField("proj_hor", VectorUDT(), False)
])

#proj_new = partial(project_gravity_core, rel = True)
proj_func = udf(project_gravity_xyz, schema)

df_proj = df_flat['X', 'Y', 'Z', 'key'].withColumn('proj', proj_func("X", "Y", "Z"))
df_proj = df_proj.select('key',
                 'proj.proj_ver', 
                 'proj.proj_hor')
df_proj.show(2)

########################################################################################
from scipy.signal import butter, filtfilt
from future.utils import lmap
Example #19
def parseVector(row):
    # reconstructed header and body: the listing is truncated above this return;
    # each input Row carries one line of space-separated numbers in row.value
    array = np.array([float(x) for x in row.value.split(' ')])
    return _convert_to_vector(array)


if __name__ == "__main__":

    FEATURES_COL = "features"

    if len(sys.argv) != 3:
        print("Usage: kmeans_example.py <file> <k>", file=sys.stderr)
        exit(-1)
    path = sys.argv[1]
    k = int(sys.argv[2])

    spark = SparkSession.builder.appName("PythonKMeansExample").getOrCreate()

    lines = spark.read.text(path).rdd
    data = lines.map(parseVector)
    row_rdd = data.map(lambda x: Row(x))
    schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)])
    df = spark.createDataFrame(row_rdd, schema)

    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL)
    model = kmeans.fit(df)
    centers = model.clusterCenters()

    print("Cluster Centers: ")
    for center in centers:
        print(center)

    spark.stop()
Example #20
dict_broad=sc.broadcast(dictionaryBigrams)


from pyspark.mllib.linalg import SparseVector
def vectorizeBi(row,dico):
    vector_dict={}
    for w in row.bigrams:
        if w in dico:
            vector_dict[dico[w]]=1
    return (row.label,SparseVector(len(dico),vector_dict))


from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField,DoubleType

schema = StructType([StructField('label',DoubleType(),True),StructField('bigramVectors',VectorUDT(),True)])


features=dfBigram.map(partial(vectorizeBi,dico=dict_broad.value)).toDF(schema)

print "Features from bigrams created"

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print "labels indexed"
Example #21
from test_helper import Test
Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3), [0, 0, 0],
                  'incorrect value for irisDFZeroIndex')

# COMMAND ----------

# MAGIC %md
# MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`.  We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`.  To do that we'll need to create a `udf` and apply it to our dataset.  Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction).
# MAGIC  
# MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method.

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import udf
# Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types
# VectorUDT should be the return type of the udf
from pyspark.mllib.linalg import Vectors, VectorUDT

# Take the first two values from a SparseVector and convert them to a DenseVector
firstTwoFeatures = udf(lambda sv: Vectors.dense(sv.toArray()[:2]), VectorUDT())

irisTwoFeatures = irisDFZeroIndex.select(firstTwoFeatures('features').alias('features'), 'label').cache()
display(irisTwoFeatures)

# COMMAND ----------

# TEST
Test.assertEquals(str(irisTwoFeatures.first()), 'Row(features=DenseVector([-0.5556, 0.25]), label=0.0)',
                  'incorrect definition of firstTwoFeatures')
Example #22
    def _transform(self, dataset):
        # dataset format -> peer_paper_id | paper_id | user_id | citeulike_paper_id
        def diff(v1, v2):
            """
            Calculate the difference between two arrays.

            :return: array of their difference
            """
            array1 = numpy.array(v1)
            array2 = numpy.array(v2)
            result = numpy.subtract(array1, array2)
            return Vectors.dense(result)

        def split_papers(papers_id_list):
            """
            Shuffle the input list of paper ids and divide it into two lists. The ratio is 50/50.

            :param papers_id_list: initial list of paper ids that will be split
            :return: two arrays of paper ids. The first contains the "positive paper ids", whose
            differences will be added with label 1; the second contains the "negative paper ids",
            added with label 0.
            """
            shuffle(papers_id_list)
            ratio = int(0.5 * len(papers_id_list))
            positive_class_set = papers_id_list[:ratio]
            negative_class_set = papers_id_list[ratio:]
            return [positive_class_set, negative_class_set]

        vector_diff_udf = F.udf(diff, VectorUDT())
        split_papers_udf = F.udf(split_papers,
                                 ArrayType(ArrayType(StringType())))

        if (self.pairs_generation == "edp"
            ):  # self.Pairs_Generation.EQUALLY_DISTRIBUTED_PAIRS):
            # 50 % of the paper_pairs with label 1, 50% with label 0
            peers_per_paper = None
            if (self.model_training == "gm"):
                # get a list of peer paper ids per paper
                dataset = dataset.select(
                    self.paperId_col, self.peer_paperId_col).dropDuplicates()
                peers_per_paper = dataset.groupBy(self.paperId_col).agg(
                    F.collect_list(
                        self.peer_paperId_col).alias("peers_per_paper"))
            else:
                peers_per_paper = dataset.groupBy(
                    self.userId_col, self.paperId_col).agg(
                        F.collect_list(
                            self.peer_paperId_col).alias("peers_per_paper"))

            # generate 50/50 distribution to positive/negative class
            peers_per_paper = peers_per_paper.withColumn(
                "equally_distributed_papers",
                split_papers_udf("peers_per_paper"))
            # positive label 1
            # user_id | paper_id | peers_per_paper | equally_distributed_papers | positive_class_papers |
            positive_class_per_paper = peers_per_paper.withColumn(
                "positive_class_papers",
                F.col("equally_distributed_papers")[0])

            # user_id | paper_id | peer_paper_id
            if (self.model_training == "gm"):
                positive_class_per_paper = positive_class_per_paper.select(
                    self.paperId_col,
                    F.explode("positive_class_papers").alias(
                        self.peer_paperId_col))
            else:
                positive_class_per_paper = positive_class_per_paper.select(
                    self.userId_col, self.paperId_col,
                    F.explode("positive_class_papers").alias(
                        self.peer_paperId_col))

            # add lda paper representation to each paper based on its paper_id
            positive_class_dataset = self.vectorizer_model.transform(
                positive_class_per_paper)
            # get in which columns the result of the transform is stored
            former_paper_output_column = self.vectorizer_model.output_col
            former_papeId_column = self.vectorizer_model.paperId_col

            # add lda ids paper representation for peer papers
            self.vectorizer_model.setPaperIdCol("peer_paper_id")
            self.vectorizer_model.setOutputCol("peer_paper_lda_vector")

            # schema -> peer_paper_id | paper_id | user_id | citeulike_paper_id | lda_vector | peer_paper_lda_vector
            positive_class_dataset = self.vectorizer_model.transform(
                positive_class_dataset)

            # return the default columns of the paper profiles model, the model is ready for the training
            # of the next SVM model
            self.vectorizer_model.setPaperIdCol(former_papeId_column)
            self.vectorizer_model.setOutputCol(former_paper_output_column)

            # add the difference (paper_vector - peer_paper_vector) with label
            positive_class_dataset = positive_class_dataset.withColumn(
                self.output_col,
                vector_diff_udf(former_paper_output_column,
                                "peer_paper_lda_vector"))
            # add label 1
            positive_class_dataset = positive_class_dataset.withColumn(
                self.label_col, F.lit(1))

            # negative label 0
            negative_class_per_paper = peers_per_paper.withColumn(
                "negative_class_papers",
                F.col("equally_distributed_papers")[1])
            if (self.model_training == "gm"):
                negative_class_per_paper = negative_class_per_paper.select(
                    self.paperId_col,
                    F.explode("negative_class_papers").alias(
                        self.peer_paperId_col))
            else:
                negative_class_per_paper = negative_class_per_paper.select(
                    self.userId_col, self.paperId_col,
                    F.explode("negative_class_papers").alias(
                        self.peer_paperId_col))

            # add lda paper representation to each paper based on its paper_id
            negative_class_dataset = self.vectorizer_model.transform(
                negative_class_per_paper)
            # get in which columns the result of the transform is stored
            former_paper_output_column = self.vectorizer_model.output_col
            former_papeId_column = self.vectorizer_model.paperId_col

            # add lda ids paper representation for peer papers
            self.vectorizer_model.setPaperIdCol("peer_paper_id")
            self.vectorizer_model.setOutputCol("peer_paper_lda_vector")

            # schema -> peer_paper_id | paper_id | user_id | citeulike_paper_id | lda_vector | peer_paper_lda_vector
            negative_class_dataset = self.vectorizer_model.transform(
                negative_class_dataset)

            # return the default columns of the paper profiles model, the model is ready for the training
            # of the next SVM model
            self.vectorizer_model.setPaperIdCol(former_papeId_column)
            self.vectorizer_model.setOutputCol(former_paper_output_column)

            # add the difference (peer_paper_vector - paper_vector) with label 0
            negative_class_dataset = negative_class_dataset.withColumn(
                self.output_col,
                vector_diff_udf("peer_paper_lda_vector",
                                former_paper_output_column))
            # add label 0
            negative_class_dataset = negative_class_dataset.withColumn(
                self.label_col, F.lit(0))

            result = positive_class_dataset.union(negative_class_dataset)
        elif (self.pairs_generation == "dp"
              ):  #self.Pairs_Generation.DUPLICATED_PAIRS):

            # add lda paper representation to each paper based on its paper_id
            dataset = self.vectorizer_model.transform(dataset)
            # get in which columns the result of the transform is stored
            former_paper_output_column = self.vectorizer_model.output_col
            former_papeId_column = self.vectorizer_model.paperId_col

            # add lda ids paper representation for peer papers
            self.vectorizer_model.setPaperIdCol("peer_paper_id")
            self.vectorizer_model.setOutputCol("peer_paper_lda_vector")

            # schema -> peer_paper_id | paper_id | user_id ? | citeulike_paper_id | lda_vector | peer_paper_lda_vector
            dataset = self.vectorizer_model.transform(dataset)

            # return the default columns of the paper profiles model, the model is ready for the training
            # of the next SVM model
            self.vectorizer_model.setPaperIdCol(former_papeId_column)
            self.vectorizer_model.setOutputCol(former_paper_output_column)

            # add the difference (paper_vector - peer_paper_vector) with label 1
            positive_class_dataset = dataset.withColumn(
                self.output_col,
                vector_diff_udf(former_paper_output_column,
                                "peer_paper_lda_vector"))
            # add label 1
            positive_class_dataset = positive_class_dataset.withColumn(
                self.label_col, F.lit(1))
            # add the difference (peer_paper_vector - paper_vector) with label 0
            negative_class_dataset = dataset.withColumn(
                self.output_col,
                vector_diff_udf("peer_paper_lda_vector",
                                former_paper_output_column))
            # add label 0
            negative_class_dataset = negative_class_dataset.withColumn(
                self.label_col, F.lit(0))
            result = positive_class_dataset.union(negative_class_dataset)
        elif (self.pairs_generation == "ocp"
              ):  #self.Pairs_Generation.ONE_CLASS_PAIRS):

            # add lda paper representation to each paper based on its paper_id
            dataset = self.vectorizer_model.transform(dataset)
            # get in which columns the result of the transform is stored
            former_paper_output_column = self.vectorizer_model.output_col
            former_papeId_column = self.vectorizer_model.paperId_col

            # add lda ids paper representation for peer papers
            self.vectorizer_model.setPaperIdCol("peer_paper_id")
            self.vectorizer_model.setOutputCol("peer_paper_lda_vector")

            # schema -> peer_paper_id | paper_id | user_id ? | citeulike_paper_id | lda_vector | peer_paper_lda_vector
            dataset = self.vectorizer_model.transform(dataset)

            # return the default columns of the paper profiles model, the model is ready for the training
            # of the next SVM model
            self.vectorizer_model.setPaperIdCol(former_papeId_column)
            self.vectorizer_model.setOutputCol(former_paper_output_column)

            # add the difference (paper_vector - peer_paper_vector) with label 1
            result = dataset.withColumn(
                self.output_col,
                vector_diff_udf(former_paper_output_column,
                                "peer_paper_lda_vector"))
            # add label 1
            result = result.withColumn(self.label_col, F.lit(1))
        else:
            # throw an error - unsupported option
            raise ValueError('The option ' + self.pairs_generation +
                             ' is not supported.')

        # drop lda vectors - not needed anymore
        result = result.drop("peer_paper_lda_vector",
                             former_paper_output_column)
        return result
Example #23
    return SparseVector(len(dictionaryBigrams), vector_dict)


# In[52]:

# This is where it gets messy: it took real effort to apply a function to an entire column
# of a DataFrame. Unlike pandas, there is no "apply" function; you have to fall back on
# UserDefinedFunctions, and remember that the SparseVector type will not be recognized by the
# DataFrame, which is only compatible with a restricted set of types.

# EDIT: I had not realized it, but I had already done this manipulation when overriding the
# tokenizer and POS tagger... the last five lines at the end, with the udf and all.

from pyspark.sql.functions import UserDefinedFunction
from pyspark.mllib.linalg import VectorUDT
udfVectorizeUni = UserDefinedFunction(lambda x: vectorizeUni(x), VectorUDT())

# A DataFrame is an immutable object, so there is no point trying to modify a column in place;
# instead we create a second DataFrame to which we add the column we want.
dfVect = dfBigram.withColumn("words", udfVectorizeUni("words"))
# The words have indeed been replaced here by the sparse vectors
print "DataFrame(1-gram): words have been replaced by sparse vectors"
dfVect.show()

udfVectorizeBi = UserDefinedFunction(lambda x: vectorizeBi(x), VectorUDT())
dfVect2 = dfVect.withColumn("bigrams", udfVectorizeBi("bigrams"))
print "DataFrame(bi-gram): On a bien remplacé ici du coup les mots par les vecteurs sparse"
dfVect2.show()

# For language-processing operations it is customary to L2-normalize the feature
# vectors: apparently that is what works best.
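A minimal sketch of that normalization step (not part of the original notebook), assuming pyspark.ml.feature.Normalizer and the dfVect2 frame built above:

from pyspark.ml.feature import Normalizer

# L2-normalize the sparse bigram vectors; p=2.0 selects the L2 norm
normalizer = Normalizer(inputCol="bigrams", outputCol="bigramsNorm", p=2.0)
dfNorm = normalizer.transform(dfVect2)
dfNorm.show()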
Example #24
def f4():
    spark_builder = SparkSession.builder.appName(
        'PythonStreamingReceiverKafkaWordCount')
    spark_builder.config(
        'spark.jars.packages', ','.join([
            'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.0',
            'org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0',
            'mysql:mysql-connector-java:5.1.38'
        ]))

    spark_builder.config('spark.master', 'local[*]')

    url = "jdbc:mysql://oxumare.ctweb.inweb.org.br:33060/festival"
    properties = {"user": "******", "password": "******"}

    ss = spark_builder.getOrCreate()

    kafka_server = "oxumare:9092"
    topic_name = "tweets_ctb"

    stream = ss.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_server) \
        .option("subscribe", topic_name) \
        .load()

    json_schema = StructType([
        StructField("_id", LongType()),
        StructField("created_at", TimestampType()),
        StructField("latitude", FloatType()),
        StructField("longitude", FloatType()),
        StructField("cell", IntegerType()),
        StructField("text", StringType()),
        StructField("user", StringType())
    ])

    tweets = stream.select(
        functions.from_json(stream.value.cast(
            StringType()), json_schema).alias('json')).select(
                functions.col('json.user').alias('user'),
                functions.col('json.text').alias('text'),
                functions.col('json.cell').alias('cell'),
                functions.col('json.latitude').alias('latitude'),
                functions.col('json.longitude').alias('longitude'),
                functions.col('json.created_at').alias('date'))

    tweets = remove_accents_punctuation(tweets)
    tweets = tokenize(tweets)
    tweets = remove_stop_words(ss, tweets)
    tweets = generate_n_grams(tweets)
    tweets = words_to_vector(tweets)
    tweets = tweets.drop('text_cleaned', 'words', 'words_stops_removed',
                         'n_grams')
    tweets, labels = feature_index(tweets)

    # tweets = tweets.select(['latitude', 'longitude', 'cell'])
    tweets.printSchema()

    actions = [
        apply_decision_tree_classifier, apply_logistic_regression_classifier,
        apply_naive_bayes_classifier
    ]

    for i, action in enumerate(actions):
        tweets = action(tweets) \
            .withColumnRenamed('probability', 'probability{}'.format(i)) \
            .drop('rawPrediction', 'prediction')

    tweets.printSchema()

    tweets = tweets.withColumn(
        'final_probability',
        functions.udf(average_probabilities, VectorUDT())(
            tweets['probability0'], tweets['probability1'],
            tweets['probability2'])).withColumn(
                'result',
                functions.udf(lambda x: labels[np.argmax(x)])(
                    'final_probability'))

    cols = [
        functions.udf(get_at_pos(i), FloatType())(
            tweets['final_probability']).alias(labels[i]) for i in range(3)
    ]

    tweets = tweets.select(
        ['date', 'text', 'cell', 'latitude', 'longitude', 'result'] + cols)

    tweets = tweets.groupBy(
        functions.window(
            functions.col('date'), "60 minutes", "30 minutes"),
        functions.col('cell')).agg(functions.avg('positive').alias('score'),
                                   functions.count('cell').alias('count')) \
        .orderBy('window')

    tweets = tweets.select(
        functions.to_json(functions.struct([tweets[x] for x in tweets.columns
                                            ])).alias("value"))

    # query = tweets.writeStream \
    #     .outputMode('complete') \
    #     .option("truncate", False) \
    #     .format('console') \
    #     .start()
    # query = tweets.writeStream \
    #     .outputMode('complete') \
    #     .option("truncate", False) \
    #     .format('console') \
    #     .trigger(processingTime='60 seconds') \
    #     .start()
    query = tweets.writeStream \
        .format("kafka").option("kafka.bootstrap.servers", kafka_server) \
        .outputMode('complete') \
        .option("topic", topic_name + "_result") \
        .option("checkpointLocation", "/data/checkpoint/1").start()

    query.awaitTermination()
Example #25
def get_euclidean_mfcc(vec1, vec2):
    # each vector packs a 13-dim MFCC mean followed by a flattened 13x13 covariance matrix
    mean1 = vec1[0:13]
    cov1 = vec1[13:].reshape(13, 13)
    mean2 = vec2[0:13]
    cov2 = vec2[13:].reshape(13, 13)
    iu1 = np.triu_indices(13)
    # np.concatenate expects the arrays as an iterable (a tuple or list): np.concatenate((a, b), axis=None)
    div = distance.euclidean(np.concatenate((mean1, cov1[iu1]), axis=None),
                             np.concatenate((mean2, cov2[iu1]), axis=None))
    return div

tic1 = int(round(time.time() * 1000))
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

#########################################################
#   Pre- Process RH and RP for Euclidean
#
rp = sc.textFile("features[0-9]*/out[0-9]*.rp")
rp = rp.map(lambda x: x.split(","))
kv_rp= rp.map(lambda x: (x[0].replace(";","").replace(".","").replace(",","").replace(" ",""), list(x[1:])))
rp_df = sqlContext.createDataFrame(kv_rp, ["id", "rp"])
rp_df = rp_df.select(rp_df["id"],list_to_vector_udf(rp_df["rp"]).alias("rp"))
rh = sc.textFile("features[0-9]*/out[0-9]*.rh")
rh = rh.map(lambda x: x.split(","))
kv_rh= rh.map(lambda x: (x[0].replace(";","").replace(".","").replace(",","").replace(" ",""), list(x[1:])))
rh_df = sqlContext.createDataFrame(kv_rh, ["id", "rh"])
rh_df = rh_df.select(rh_df["id"],list_to_vector_udf(rh_df["rh"]).alias("rh"))
Example #26
def main(argv=None):
    plot = ""
    if argv is None:
        inputs = sys.argv[1]
        if (len(sys.argv) > 2):
            plot = sys.argv[2]  # "plot" to show test RMSE plot

    conf = SparkConf().setAppName('matrix-factorization-recommend')
    sc = SparkContext(conf=conf)
    sqlCt = SQLContext(sc)

    #read train text file and prepare rating data (userID, movieID, rating)
    text = sqlCt.read.text(inputs + "/MovieLens100K_train.txt")
    train = text.map(lambda row: row.value.split("\t")) \
                .map(lambda l: (int(l[0]), int(l[1]), float(l[2]))) \
                .toDF(["userID", "movieID", "rating"])
    train.cache()

    #read test text file and prepare rating data (userID, movieID, rating)
    text = sqlCt.read.text(inputs + "/MovieLens100K_test.txt")
    test = text.map(lambda row: row.value.split("\t")) \
                .map(lambda l: (int(l[0]), int(l[1]), float(l[2]))) \
                .toDF(["userID", "movieID", "rating"])
    test.cache()

    #read movie names
    text = sqlCt.read.text(inputs + "/u.item")
    movie_names = text.map(lambda row: row.value.split("|")) \
                .map(lambda l: (int(l[0]), l[1])) \
                .toDF(["id", "movieName"])
    movie_names.cache()

    # Build the recommendation model using explicit ALS
    als = ALS(maxIter=20,
              userCol="userID",
              itemCol="movieID",
              ratingCol="rating")

    # List to store results:
    model_result = []
    cluster_result = []

    # Parameter grid for cross validation
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    ranks = [2, 4, 8, 16, 32, 64, 128, 256]
    for rank in ranks:
        paramGrid = ParamGridBuilder() \
        .addGrid(als.rank, [rank]) \
        .build()

        # 5-fold cross validation
        crossval = CrossValidator(estimator=als,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=5)

        # Run cross-validation.
        model = crossval.fit(train)

        # RMSE on test data - filtering out new users who would not have any prediction
        prediction_test = model.transform(test).filter("prediction <> 'NaN'")
        rmse_test = evaluator.evaluate(prediction_test)
        model_result.append((rank, rmse_test))

        # K-mean clustering for items based on 50 factors
        item_factors = model.bestModel.itemFactors \
            .withColumn("features_vector", udf(lambda x: Vectors.dense(x),VectorUDT())("features")) \
            .cache()
        kmeans = KMeans(featuresCol="features_vector", predictionCol="cluster", \
                        initMode="random", k=50, seed = 1)
        model_kmeans = kmeans.fit(item_factors)
        item_clusters = model_kmeans.transform(item_factors)
        item_factors.unpersist()

        # Number of items small enough to collect
        two_clusters = item_clusters.filter("cluster < 2")   \
                .join(movie_names, on="id") \
                .select("cluster", "movieName") \
                .map(lambda row: (row[0],row[1])).collect()
        cluster1 = list(
            map(lambda x: x[1].encode("utf-8"),
                filter(lambda x: x[0] == 0, two_clusters)))
        cluster2 = list(
            map(lambda x: x[1].encode("utf-8"),
                filter(lambda x: x[0] == 1, two_clusters)))
        cluster_result.append((rank, (cluster1, cluster2)))

    # Show plot if run locally
    if (plot == "plot"):
        plotRMSE(model_result)

    # Print results
    print("MATRIX FACTORIZATION COLLABORATIVE FILTERING: ")
    for i in model_result:
        print("- Rank = %i: Test RMSE = %s" % (i[0], i[1]))

    print("\nTwo Clusters: ")
    for i in cluster_result:
        print("- Rank = %i:\n   Cluster-1: %s\n   Cluster-2: %s" \
              %(i[0], i[1][0], i[1][1]))
Example #27
baseDir = '/mnt/ml-class/'
irisFourFeatures = sqlContext.read.parquet(baseDir + 'irisFourFeatures.parquet')
print '\n'.join(map(repr, irisFourFeatures.take(2)))

# COMMAND ----------

# MAGIC %md
# MAGIC Convert the data from `SparseVector` to `DenseVector` types.

# COMMAND ----------

from pyspark.sql.functions import udf
from pyspark.mllib.linalg import Vectors, VectorUDT, DenseVector

sparseToDense = udf(lambda sv: Vectors.dense(sv.toArray()), VectorUDT())
irisDense = irisFourFeatures.select(sparseToDense('features').alias('features'), 'label')

print '\n'.join(map(repr, irisDense.take(2)))

# COMMAND ----------

# MAGIC %md
# MAGIC Save the new format for use in another notebook.

# COMMAND ----------

#irisDense.write.mode('overwrite').parquet('/tmp/irisDense.parquet')

# COMMAND ----------
Example #28
 def test_json_schema(self):
     self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)
Example #29
def get_top_k_topics_idx(topicDistribution,k,num_topics):
  # reconstructed: the listing is truncated here; mirrors get_top_1_topics_idx below,
  # but returns the full vector of the top-k topic indices
  array_dict = {}
  for i in range(0,num_topics):
    array_dict[i] = topicDistribution[i]
  temporary = sorted(array_dict.items(), key=operator.itemgetter(1), reverse = True)[0:k]
  idxes = []
  for _ in range(0,k):
    idxes.append(temporary[_][0])
  return Vectors.dense(idxes)

def get_top_1_topics_idx(topicDistribution,k,num_topics):
  array_dict = {}
  for i in range(0,num_topics):
    array_dict[i] = topicDistribution[i]
  temporary = sorted(array_dict.items(), key=operator.itemgetter(1), reverse = True)[0:k]
  #print(temporary)
  idxes = []
  for _ in range(0,k):
    idxes.append(temporary[_][0])
  return Vectors.dense(idxes)[0]

k = 3
sqlContext.registerFunction("get_top_k_topics_idx",udf(lambda x: get_top_k_topics_idx(x,k,num_topics), VectorUDT()))  
sqlContext.registerFunction("get_top_1_topics_idx",udf(lambda x: float(get_top_1_topics_idx(x,1,num_topics)), FloatType()))  

def extractTopics(transformed):
  transformed.createOrReplaceTempView("transformed")
  estrai_topic = sqlContext.sql("select *, get_top_k_topics_idx(topicDistribution) as topTopics,get_top_1_topics_idx(topicDistribution) as firstTopic  from transformed")
  #, get_top_k_topics_1(topicDistribution) as topTopics
  display(estrai_topic)
  estrai_topic.createOrReplaceTempView("extracted_transformed")
  return estrai_topic

# COMMAND ----------

transformed = pipeline_model.transform(freqItemsets)
new_transformed = extractTopics(transformed)
display(new_transformed)
Example #30
 def test_json_schema(self):
     self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)
display(fig)

# COMMAND ----------

# MAGIC %md
# MAGIC Prepare the data so that we have the sepal width as our target and a dense vector containing sepal length as our features.

# COMMAND ----------

from pyspark.sql.functions import udf, lit
from pyspark.sql.types import DoubleType
from pyspark.mllib.linalg import VectorUDT, Vectors

getElement = udf(lambda v, i: float(v[i]), DoubleType())
getElementAsVector = udf(lambda v, i: Vectors.dense([v[i]]), VectorUDT())

irisSepal = irisDense.select(
    getElement('features', lit(1)).alias('sepalWidth'),
    getElementAsVector('features', lit(0)).alias('features'))
irisSepal.cache()

display(irisSepal)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Build a linear regression model

# COMMAND ----------
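The next notebook cell is not included in this listing; below is a minimal sketch of what it presumably contains, assuming pyspark.ml.regression.LinearRegression (Spark 1.6+ for the coefficients attribute) and the irisSepal frame prepared above:

from pyspark.ml.regression import LinearRegression

# Fit sepalWidth as a linear function of the single-element feature vector built above.
lr = LinearRegression(featuresCol='features', labelCol='sepalWidth')
lrModel = lr.fit(irisSepal)
print lrModel.intercept, lrModel.coefficients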