    # Only keep tweets in English
    tweets_df = tweets_df.filter(tweets_df.lang == "en")
    # We strip URLs, @mentions, newlines, the "RT" marker and digits
    tweets_df = tweets_df.withColumn(
        "cleaned_tweets",
        regexp_replace(col("tweets"), r"http\S+|@\w+|\n|RT|\d+", ' '))
    # All words are lowercased and tokenized
    tweets_df = RegexTokenizer(inputCol="cleaned_tweets",
                               outputCol="lowercase_tweets",
                               pattern="\\W").transform(tweets_df)
    # We remove the stop words
    tweets_df = StopWordsRemover(
        inputCol="lowercase_tweets",
        outputCol="processed_tweets").transform(tweets_df)
    # We drop the unused columns
    tweets_df = tweets_df.drop("cleaned_tweets", "lowercase_tweets", "lang",
                               "date")
    # We load the language model
    model_path = "s3://" + bucket_name + "/models/w2v_model"
    loaded_model = Word2VecModel.load(model_path)
    # We add the output column: the average of the word vectors for each tweet
    tweets_df = loaded_model.transform(tweets_df)

    # We load the classifier
    clf_path = "s3://" + bucket_name + "/models/mpc_model"
    loaded_clf = MultilayerPerceptronClassificationModel.load(clf_path)
    predictions = loaded_clf.transform(tweets_df)

    # To keep only the probability of the predicted sentiment, we first convert the vector to an array
    to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
    predictions = predictions.withColumn("probability",
                                         to_array("probability"))
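    # Illustration only, not part of the original snippet: one way to pull out
    # the probability of the predicted class, using a hypothetical helper UDF
    # ("pick_prob") and a hypothetical output column ("predicted_probability")
    pick_prob = udf(lambda probs, idx: float(probs[int(idx)]), FloatType())
    predictions = predictions.withColumn(
        "predicted_probability",
        pick_prob(col("probability"), col("prediction")))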
Example #2
        path   = "hdfs:/input/prices.csv",
        header = False,
        schema = StructType([StructField("asin", StringType(), True),
                             StructField("price", FloatType(), True)]))

    # Keep only the reviews whose product also has a price entry
    reviews = reviews.join(prices, ["asin"], how="leftsemi")

    # # Use nltk.word_tokenize to tokenize words
    # @udf(ArrayType(StringType()))
    # def tokenize(string):
    #     return word_tokenize(string)

    # reviews = reviews.withColumn("words", tokenize("reviewText"))

    reviews = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(reviews)
    reviews = reviews.drop("reviewText")

    reviews = reviews.withColumn("num_words", size("words"))
    reviews = reviews.drop("words")

    reviews = reviews.groupBy("asin").agg(avg("num_words").alias("average_review_length"))

    data = reviews.join(prices, ["asin"])
    data = data.drop("asin")
    data = data.repartition(20)

    xy = data.rdd.map(lambda row: (row.average_review_length, row.price))
    xy = xy.coalesce(8)
    x = xy.map(lambda v: v[0])
    y = xy.map(lambda v: v[1])
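    # Illustration only, not part of the original snippet: with the review lengths
    # in x and the prices in y, a natural follow-up is their Pearson correlation
    # via MLlib's Statistics helper ("length_price_corr" is a hypothetical name)
    from pyspark.mllib.stat import Statistics
    length_price_corr = Statistics.corr(x, y, method="pearson")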
Example #3
        escape = "\"",
        schema = StructType([StructField("reviewId", IntegerType(), True),
                             StructField("asin", StringType(), True),
                             StructField("reviewText", StringType(), True)]))
    df = df.drop("asin")
    df = df.repartition(20)

    # # Use nltk.word_tokenize to tokenize words
    # @udf(ArrayType(StringType()))
    # def tokenize(string):
    #     return word_tokenize(string)

    # df = df.withColumn("words", tokenize("reviewText"))

    df = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(df)
    df = df.drop("reviewText")

    # Fit a CountVectorizer to get per-review term frequencies and keep its
    # vocabulary (index -> word) for the mapping UDF below
    cv_model = CountVectorizer(inputCol="words", outputCol="tf").fit(df)
    vocabulary = cv_model.vocabulary

    df = cv_model.transform(df)
    df = df.drop("words")
    # Cache the term-frequency DataFrame: IDF.fit and the transform below both read it
    tf_df = df.cache()

    df = IDF(inputCol="tf", outputCol="tfidf").fit(tf_df).transform(tf_df)
    df = df.drop("tf")
    # Release the cached term-frequency data once the TF-IDF column is built
    tf_df.unpersist()

    # Map each word in the vocabulary to its TF-IDF weight for the review
    @udf(MapType(StringType(), FloatType()))
    def create_map(vector):
        zipped = zip(vector.indices, vector.values)