Example #1
    def test_vector_size_hint(self):
        df = self.spark.createDataFrame(
            [(0, Vectors.dense([0.0, 10.0, 0.5])),
             (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
             (2, Vectors.dense([2.0, 12.0]))],
            ["id", "vector"])

        sizeHint = VectorSizeHint(
            inputCol="vector",
            handleInvalid="skip")
        sizeHint.setSize(3)
        self.assertEqual(sizeHint.getSize(), 3)

        output = sizeHint.transform(df).head().vector
        expected = DenseVector([0.0, 10.0, 0.5])
        self.assertEqual(output, expected)

# COMMAND ----------

### VectorSizeHint adds size metadata to a vector column; with handleInvalid="skip",
### rows whose vectors do not match the declared size are filtered out
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
     (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

sizeHint = VectorSizeHint(inputCol="userFeatures",
                          handleInvalid="skip",
                          size=3)

datasetWithSize = sizeHint.transform(dataset)
print("Rows where 'userFeatures' is not the right size are filtered out")
datasetWithSize.show(truncate=False)

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")

# This dataframe can be used by downstream transformers as before
output = assembler.transform(datasetWithSize)
print(
    "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'"
)
output.select("features", "clicked").show(truncate=False)
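
# COMMAND ----------

### Hedged sketch, not part of the original notebook: besides "skip", handleInvalid
### also accepts "error" (raise on a mismatched vector) and "optimistic" (pass every
### row through unchecked). The variable name below is illustrative only.
sizeHintOptimistic = VectorSizeHint(inputCol="userFeatures",
                                    handleInvalid="optimistic",
                                    size=3)
# With "optimistic" no rows are filtered, so a downstream VectorAssembler may fail
# if a vector turns out to have the wrong size.
sizeHintOptimistic.transform(dataset).show(truncate=False)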
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorSizeHintExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
         (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    sizeHint = VectorSizeHint(
        inputCol="userFeatures",
        handleInvalid="skip",
        size=3)

    datasetWithSize = sizeHint.transform(dataset)
    print("Rows where 'userFeatures' is not the right size are filtered out")
    datasetWithSize.show(truncate=False)

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    # This dataframe can be used by downstream transformers as before
    output = assembler.transform(datasetWithSize)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
Example #5
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('polarity_subjectivity',
                          types.ArrayType(types.FloatType())),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ])

    headlines_df = spark.read.json(input_dir,
                                   encoding='utf-8',
                                   schema=df_schema).repartition(80)
    split_sentiment_df = headlines_df \
        .withColumn('polarity',
                    functions.element_at(headlines_df['polarity_subjectivity'], 1)) \
        .withColumn('subjectivity',
                    functions.element_at(headlines_df['polarity_subjectivity'], 2))

    df_sentiment = split_sentiment_df.withColumn(
        'label', get_label(split_sentiment_df['polarity']))

    training_set, validation_set = df_sentiment.randomSplit([0.75, 0.25])

    headline_vector_size = 3
    word_freq_vector_size = 100

    tokenizer = Tokenizer(inputCol='title_clean', outputCol='words')
    headline2Vector = Word2Vec(vectorSize=headline_vector_size,
                               minCount=0,
                               inputCol='words',
                               outputCol='headline_vector')
    hashingTF = HashingTF(inputCol='words',
                          outputCol='word_counts',
                          numFeatures=word_freq_vector_size)
    idf = IDF(inputCol='word_counts', outputCol='word_frequency', minDocFreq=5)
    headline_vector_size_hint = VectorSizeHint(
        inputCol='headline_vector',
        size=headline_vector_size)  # needed so the pipeline can run on streaming data
    word_freq_vector_size_hint = VectorSizeHint(
        inputCol='word_frequency',
        size=word_freq_vector_size)  # needed so the pipeline can run on streaming data
    feature_assembler = VectorAssembler(
        inputCols=['headline_vector', 'score', 'num_comments', 'subjectivity',
                   'word_frequency'],
        outputCol='features')
    dt_classifier = DecisionTreeClassifier(featuresCol='features',
                                           labelCol='label',
                                           predictionCol='prediction',
                                           maxDepth=9)

    pipeline = Pipeline(stages=[
        tokenizer, headline2Vector, hashingTF, idf, headline_vector_size_hint,
        word_freq_vector_size_hint, feature_assembler, dt_classifier
    ])
    sentiment_model = pipeline.fit(training_set)

    validation_predictions = sentiment_model.transform(validation_set)

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label')
    validation_score = evaluator.evaluate(validation_predictions)
    print('Validation score for Sentiment model F1: %g' % (validation_score, ))

    validation_score_accuracy = evaluator.evaluate(
        validation_predictions, {evaluator.metricName: "accuracy"})
    print('Validation score for Sentiment model Accuracy: %g' %
          (validation_score_accuracy, ))

    sentiment_model.write().overwrite().save(output_dir)
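
    # Hedged usage sketch, not part of the original example: the VectorSizeHint
    # stages above are what make the saved pipeline usable on streaming input
    # later. The saved model can be reloaded with the standard PipelineModel API:
    from pyspark.ml import PipelineModel
    reloaded_model = PipelineModel.load(output_dir)
    print('Reloaded pipeline stages: %s' % (reloaded_model.stages, ))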
# In[10]:

# Execute this cell
# When using MLlib with structured streaming, VectorAssembler has
# some limitations: it can only work on Vector columns of known size.
# To address this we explicitly specify the size of the pcaVector column
# so that the pipeline can be used with structured streaming (a hedged
# streaming sketch follows the pipeline cell below). To do this we'll
# use the VectorSizeHint transformer.

from pyspark.ml.feature import VectorSizeHint

# In[11]:

# Question 7. Use VectorSizeHint() with inputCol="pcaVector", size=28.
vectorSizeHint = VectorSizeHint(inputCol="pcaVector", size=28)

# In[12]:

# Execute this cell
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# In[13]:

# Question 8. Create a Pipeline() and include the stages equal to a
# list of oneHot, vectorSizeHint, vectorAssembler, estimator. Save in
# a variable named pipeline.
pipeline = Pipeline(
    stages=[oneHot, vectorSizeHint, vectorAssembler, estimator])
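
# Hedged sketch, not part of the original notebook: with the size hint in place,
# the fitted pipeline can also score a streaming DataFrame. `trainDF` (a static
# DataFrame containing the "pcaVector" column) and the streaming source path are
# assumptions made for illustration.
pipelineModel = pipeline.fit(trainDF)
streamingDF = spark.readStream.schema(trainDF.schema).parquet("data/stream/")
streamingPredictions = pipelineModel.transform(streamingDF)
query = streamingPredictions.writeStream.format("memory").queryName("preds").start()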
Example #7
# Word2Vec

dataset = dataset.withColumn(
    'categorical',
    F.concat(F.array('rat'), F.array('mcc'), F.array('mnc'), F.array('msin'),
             F.array('tac'), F.array('snr')))

word2Vec_output_path = "{}/data/word2VecModel.bin".format(base_path)
word2Vec = Word2VecModel.load(word2Vec_output_path)
dataset = word2Vec.transform(dataset)

# VectorAssembler

sizeHint = VectorSizeHint(inputCol="vcategorical",
                          handleInvalid="skip",
                          size=50)
dataset = sizeHint.transform(dataset)

vector_assembler_output_path = "{}/data/vectorAssemblerW2VModel.bin".format(
    base_path)
vector_assembler = VectorAssembler.load(vector_assembler_output_path)
dataset = vector_assembler.transform(dataset)

# Classification

model_path = "{}/data/distanceKmeansRmW2VModel.bin".format(base_path)
model = KMeansModel.load(model_path)
predictions = model.transform(dataset)

centers = model.clusterCenters()
Example #8
    heros_to_lineup_udf = udf(onehot, VectorUDT())
    return df.withColumn("dire_lineup_vec", heros_to_lineup_udf(df.dire_lineup))\
             .withColumn("radiant_lineup_vec", heros_to_lineup_udf(df.radiant_lineup))

df = convert_heroes_to_lineup(df)

def convert_types(df: DataFrame) -> DataFrame:
    return df.withColumn("radiant_win_int", df.radiant_win.cast(IntegerType()))

df = convert_types(df)

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, VectorSizeHint

size_hint_dire = VectorSizeHint(inputCol="dire_lineup_vec", size=len(heroes_dict), handleInvalid="skip")
size_hint_radiant = VectorSizeHint(inputCol="radiant_lineup_vec", size=len(heroes_dict), handleInvalid="skip")
vec_assembler = VectorAssembler(inputCols=['dire_lineup_vec', 'radiant_lineup_vec'], outputCol="features")
regression = LogisticRegression(featuresCol="features", labelCol="radiant_win_int")
pipeline = Pipeline(stages=[size_hint_dire, size_hint_radiant, vec_assembler, regression])

train_df, test_df = df.randomSplit([0.8, 0.2])
model = pipeline.fit(train_df)

result_df = model.transform(test_df)

test_accuracy = result_df.filter(
    col("radiant_win_int").eqNullSafe(col("prediction"))).count() / result_df.count()

model.save("model")