from pyspark.sql.functions import explode
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString

print('############################## - CLASSIFYING DATA')
userRecommends = model.recommendForAllUsers(10)

print('############################## - EXPLODING PREDICTIONS')
flatUserRecommends = userRecommends.withColumn(
    'userAndRatings',
    explode(userRecommends.recommendations)).select('userIndex', 'userAndRatings.*')

print('############################## - CONVERTING INDEXES TO STRING')
userConverter = IndexToString(inputCol='userIndex', outputCol='userId',
                              labels=indexer_acc_fitted.labels)
itemConverter = IndexToString(inputCol='itemIndex', outputCol='itemId',
                              labels=indexer_mer_fitted.labels)
convertedMoviesRecs = Pipeline(
    stages=[userConverter, itemConverter]).fit(df).transform(flatUserRecommends)

print('############################## - SAVING DATA')
df.unpersist()
convertedMoviesRecs.cache()
convertedMoviesRecs.show()
convertedMoviesRecs.write.json('/ML/movies/usersrec/')

# spark-submit --master yarn --deploy-mode client --num-executors 3 \
#   --executor-memory 3g \
#   --driver-java-options "-XX:+UseG1GC -XX:+ResizePLAB -Xms1g -Xmx1g -XX:InitiatingHeapOccupancyPercent=35" \
#   --conf "spark.sql.tungsten.enabled=true" \
#   --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
#   --conf "spark.memory.fraction=0.9" \
#   --conf "spark.driver.memoryOverhead=1g" \
#   --conf "spark.executor.memoryOverhead=1g" \
#   --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC -XX:+ResizePLAB -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=20" \
#   --conf "spark.scheduler.mode=FAIR" \
#   als-model-predictions.py
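# A quick sanity check on the saved output: a minimal sketch, assuming the
# SparkSession is available as `spark`. This snippet is not part of the
# original job; it just reads the recommendations back and confirms the
# converted userId/itemId columns survived the round trip.
savedRecs = spark.read.json('/ML/movies/usersrec/')
savedRecs.printSchema()
savedRecs.show(5)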
wordLength = "wordLength"
wordCount = "wordCount"
wordLengthTransformer = UDFTransformer(inputCol="text", outputCol=wordLength,
                                       udf=wordLengthUDF)
wordCountTransformer = UDFTransformer(inputCol="text", outputCol=wordCount,
                                      udf=wordCountUDF)

# COMMAND ----------

from pyspark.ml import Pipeline

data = Pipeline(stages=[wordLengthTransformer, wordCountTransformer]) \
    .fit(rawData).transform(rawData) \
    .withColumn("label", rawData["rating"] > 3).drop("rating")

# COMMAND ----------

data.show(5)

# COMMAND ----------

# MAGIC %md ### 4a. Classify using pyspark
# MAGIC
# MAGIC To choose the best LogisticRegression classifier using the `pyspark`
# MAGIC library, we need to *explicitly* perform the following steps:
# MAGIC
# MAGIC 1. Process the features:
# MAGIC    * Tokenize the text column
# MAGIC    * Hash the tokenized column into a fixed-length feature vector
# MAGIC    * Merge the numeric features with that vector
# MAGIC 2. Process the label column: cast it into the proper type.
# MAGIC 3. Train multiple LogisticRegression models on the `train` dataset
# MAGIC    with different hyperparameters, as sketched in the next cell.
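# COMMAND ----------

# MAGIC %md A minimal sketch of those three steps, assuming a `train` split of
# MAGIC `data` already exists and reusing the `wordLength`/`wordCount` columns
# MAGIC from the cells above; the hyperparameter values are illustrative only.

# COMMAND ----------

from pyspark.ml.feature import Tokenizer, HashingTF, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

# 1. Process the features: tokenize, hash, then merge in the numeric columns
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
hashingTF = HashingTF(inputCol="tokens", outputCol="textFeatures")
assembler = VectorAssembler(
    inputCols=["textFeatures", wordLength, wordCount], outputCol="features")

# 2. Process the label column: cast the boolean label to double
# (`train` is assumed to be a split of `data` created earlier)
processed = assembler.transform(
    hashingTF.transform(tokenizer.transform(train))) \
    .withColumn("label", col("label").cast(DoubleType()))

# 3. Train LogisticRegression models with different (illustrative) hyperparameters
lrModels = [LogisticRegression(regParam=rp, elasticNetParam=en).fit(processed)
            for rp in (0.05, 0.1) for en in (0.0, 0.5)]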