def test_vector_size_hint(self):
    df = self.spark.createDataFrame(
        [(0, Vectors.dense([0.0, 10.0, 0.5])),
         (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
         (2, Vectors.dense([2.0, 12.0]))],
        ["id", "vector"])

    sizeHint = VectorSizeHint(
        inputCol="vector",
        handleInvalid="skip")
    sizeHint.setSize(3)
    self.assertEqual(sizeHint.getSize(), 3)

    output = sizeHint.transform(df).head().vector
    expected = DenseVector([0.0, 10.0, 0.5])
    self.assertEqual(output, expected)
)
output.select("features", "clicked").show(truncate=False)

# COMMAND ----------

### VectorSizeHint attaches a known size to a vector column; rows whose
### vectors do not match the given size are handled according to handleInvalid.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
     (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

sizeHint = VectorSizeHint(inputCol="userFeatures", handleInvalid="skip", size=3)

datasetWithSize = sizeHint.transform(dataset)
print("Rows where 'userFeatures' is not the right size are filtered out")
datasetWithSize.show(truncate=False)

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")

# This dataframe can be used by downstream transformers as before
output = assembler.transform(datasetWithSize)
print(
    "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'"
)
output.select("features", "clicked").show(truncate=False)
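# A minimal sketch (not part of the notebook above) of the other two
# handleInvalid modes of VectorSizeHint, reusing the `dataset` defined above.
# "error" (the default) fails the job when a vector has the wrong size or is
# null, while "optimistic" keeps every row and simply trusts the hint, so
# mismatched rows may fail in later stages instead.
errorHint = VectorSizeHint(inputCol="userFeatures", handleInvalid="error", size=3)
try:
    errorHint.transform(dataset).collect()  # one row has size 2, so this raises
except Exception as e:
    print("handleInvalid='error' rejected a mismatched vector:", e)

optimisticHint = VectorSizeHint(inputCol="userFeatures", handleInvalid="optimistic", size=3)
# No filtering or validation here; all rows pass through with the size metadata set.
optimisticHint.transform(dataset).show(truncate=False)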
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorSizeHintExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
         (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    sizeHint = VectorSizeHint(
        inputCol="userFeatures",
        handleInvalid="skip",
        size=3)

    datasetWithSize = sizeHint.transform(dataset)
    print("Rows where 'userFeatures' is not the right size are filtered out")
    datasetWithSize.show(truncate=False)

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    # This dataframe can be used by downstream transformers as before
    output = assembler.transform(datasetWithSize)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType())),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ])

    headlines_df = spark.read.json(input_dir, encoding='utf-8',
                                   schema=df_schema).repartition(80)

    split_sentiment_df = headlines_df \
        .withColumn('polarity',
                    functions.element_at(headlines_df['polarity_subjectivity'], 1)) \
        .withColumn('subjectivity',
                    functions.element_at(headlines_df['polarity_subjectivity'], 2))

    df_sentiment = split_sentiment_df.withColumn(
        'label', get_label(split_sentiment_df['polarity']))

    training_set, validation_set = df_sentiment.randomSplit([0.75, 0.25])

    headline_vector_size = 3
    word_freq_vector_size = 100

    tokenizer = Tokenizer(inputCol='title_clean', outputCol='words')
    headline2Vector = Word2Vec(vectorSize=headline_vector_size, minCount=0,
                               inputCol='words', outputCol='headline_vector')
    hashingTF = HashingTF(inputCol='words', outputCol='word_counts',
                          numFeatures=word_freq_vector_size)
    idf = IDF(inputCol='word_counts', outputCol='word_frequecy', minDocFreq=5)

    headline_vector_size_hint = VectorSizeHint(
        inputCol='headline_vector', size=headline_vector_size)  # need this for streaming
    word_freq_vector_size_hint = VectorSizeHint(
        inputCol='word_frequecy', size=word_freq_vector_size)  # need this for streaming

    feature_assembler = VectorAssembler(
        inputCols=['headline_vector', 'score', 'num_comments',
                   'subjectivity', 'word_frequecy'],
        outputCol='features')

    dt_classifier = DecisionTreeClassifier(featuresCol='features',
                                           labelCol='label',
                                           predictionCol='prediction',
                                           maxDepth=9)

    pipeline = Pipeline(stages=[
        tokenizer, headline2Vector, hashingTF, idf,
        headline_vector_size_hint, word_freq_vector_size_hint,
        feature_assembler, dt_classifier
    ])

    sentiment_model = pipeline.fit(training_set)
    validation_predictions = sentiment_model.transform(validation_set)

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label')
    validation_score = evaluator.evaluate(validation_predictions)
    print('Validation score for Sentiment model F1: %g' % (validation_score, ))

    validation_score_accuracy = evaluator.evaluate(
        validation_predictions, {evaluator.metricName: "accuracy"})
    print('Validation score for Sentiment model Accuracy: %g' %
          (validation_score_accuracy, ))

    sentiment_model.write().overwrite().save(output_dir)
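# A hedged sketch (not part of the script above) of why the size-hint stages
# are marked "need this for streaming": in structured streaming VectorAssembler
# must know the size of every vector input column, and VectorSizeHint supplies
# it. The paths below are hypothetical placeholders, and the same derived
# columns (polarity/subjectivity) must be added to the stream before transform.
from pyspark.ml import PipelineModel

loaded_model = PipelineModel.load('sentiment_model_dir')  # e.g. the output_dir used above
stream = spark.readStream.json('incoming_headlines_dir', schema=df_schema) \
    .withColumn('polarity', functions.element_at('polarity_subjectivity', 1)) \
    .withColumn('subjectivity', functions.element_at('polarity_subjectivity', 2))
query = loaded_model.transform(stream).select('title_clean', 'prediction') \
    .writeStream.format('console').start()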
# In[10]:

# Execute this cell

# When using MLlib with structured streaming, VectorAssembler has
# some limitations in a streaming context. Specifically, VectorAssembler
# can only work on Vector columns of known size. To address this issue we
# can explicitly specify the size of the pcaVector column so that we'll
# be able to use our pipeline with structured streaming. To do this
# we'll use the VectorSizeHint transformer.

from pyspark.ml.feature import VectorSizeHint


# In[11]:

# Question 7. Use VectorSizeHint() with inputCol="pcaVector", size=28.

vectorSizeHint = VectorSizeHint(inputCol="pcaVector", size=28)


# In[12]:

# Execute this cell

from pyspark.ml import Pipeline
from pyspark.sql.functions import col


# In[13]:

# Question 8. Create a Pipeline() and include the stages equal to a
# list of oneHot, vectorSizeHint, vectorAssembler, estimator. Save in
# a variable named pipeline.

pipeline = Pipeline(
    stages=[oneHot, vectorSizeHint, vectorAssembler, estimator])
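# A hedged sketch (not one of the graded cells above) of how the fitted
# pipeline would then be applied to a streaming source, which is the scenario
# the comments describe: VectorSizeHint lets VectorAssembler plan over
# "pcaVector" without seeing any data first. `trainingData` and the streaming
# input path are assumed placeholders, not defined in this notebook excerpt.
pipelineModel = pipeline.fit(trainingData)

streamingData = (spark.readStream
                 .schema(trainingData.schema)
                 .parquet("path/to/streaming/input"))
streamingPredictions = pipelineModel.transform(streamingData)

query = (streamingPredictions
         .select(col("prediction"))
         .writeStream.format("memory")
         .queryName("predictions")
         .start())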
# Word2Vec
dataset = dataset.withColumn(
    'categorical',
    F.concat(F.array('rat'), F.array('mcc'), F.array('mnc'),
             F.array('msin'), F.array('tac'), F.array('snr')))

word2Vec_output_path = "{}/data/word2VecModel.bin".format(base_path)
word2Vec = Word2VecModel.load(word2Vec_output_path)
dataset = word2Vec.transform(dataset)

# VectorAssembler
sizeHint = VectorSizeHint(inputCol="vcategorical",
                          handleInvalid="skip",
                          size=50)
dataset = sizeHint.transform(dataset)

vector_assembler_output_path = "{}/data/vectorAssemblerW2VModel.bin".format(
    base_path)
vector_assembler = VectorAssembler.load(vector_assembler_output_path)
dataset = vector_assembler.transform(dataset)

# Classification
model_path = "{}/data/distanceKmeansRmW2VModel.bin".format(base_path)
model = KMeansModel.load(model_path)
predictions = model.transform(dataset)
centers = model.clusterCenters()
    heros_to_lineup_udf = udf(onehot, VectorUDT())
    return df.withColumn("dire_lineup_vec", heros_to_lineup_udf(df.dire_lineup))\
             .withColumn("radiant_lineup_vec", heros_to_lineup_udf(df.radiant_lineup))


df = convert_heroes_to_lineup(df)


def convert_types(df: DataFrame) -> DataFrame:
    return df.withColumn("radiant_win_int", df.radiant_win.cast(IntegerType()))


df = convert_types(df)

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, VectorSizeHint

size_hint_dire = VectorSizeHint(inputCol="dire_lineup_vec",
                                size=len(heroes_dict),
                                handleInvalid="skip")
size_hint_radiant = VectorSizeHint(inputCol="radiant_lineup_vec",
                                   size=len(heroes_dict),
                                   handleInvalid="skip")
vec_assembler = VectorAssembler(inputCols=['dire_lineup_vec', 'radiant_lineup_vec'],
                                outputCol="features")
regression = LogisticRegression(featuresCol="features", labelCol="radiant_win_int")

pipeline = Pipeline(stages=[size_hint_dire, size_hint_radiant, vec_assembler, regression])

train_df, test_df = df.randomSplit([0.8, 0.2])
# Fit on the training split only so the test split remains unseen.
model = pipeline.fit(train_df)
result_df = model.transform(test_df)
test_accuracy = result_df.filter(
    col("radiant_win").eqNullSafe(col("prediction"))).count() / result_df.count()

model.save("model")
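# A hedged sketch (not part of the snippet above) of reloading the saved
# pipeline model and scoring fresh matches; `new_matches_df` is a hypothetical
# DataFrame with the same dire_lineup_vec / radiant_lineup_vec columns.
from pyspark.ml.pipeline import PipelineModel

loaded_model = PipelineModel.load("model")
scored_df = loaded_model.transform(new_matches_df)
scored_df.select("dire_lineup_vec", "radiant_lineup_vec",
                 "probability", "prediction").show(5)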