Example #1
 def test_stopwordsremover(self):
     dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
     stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
     # Default
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["panda"])
     self.assertEqual(type(stopWordRemover.getStopWords()), list)
     self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], str))
     # Custom
     stopwords = ["panda"]
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a"])
     # with language selection
     stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
     dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
     # with locale
     stopwords = ["BELKİ"]
     dataset = self.spark.createDataFrame([Row(input=["belki"])])
     stopWordRemover.setStopWords(stopwords).setLocale("tr")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
Example #2
def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()

    # Input data: Each row is a bag of words from a sentence or document.
    training_data = [(next(id_gen), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])

    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)

    # Learn a mapping from words to Vectors.
    word2vec = Word2Vec(vectorSize=len(training_data),
                        inputCol="text_filtered",
                        outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))

    # Use the size of the target data to keep only similarities between
    # target data and the remaining data, avoiding products of target
    # data with itself
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)

    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
Example #3
def remove_stop_words(p_df, in_column, out_column):
    """
    Removes stop words from a column in a DataFrame. The column must be a list of words.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.
    """    
    remover = StopWordsRemover(inputCol=in_column, outputCol=out_column)
    return remover.transform(p_df)
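
A minimal usage sketch for the helper above, with hypothetical data; the column passed as in_column must already hold an array of words (e.g. the output of a Tokenizer), and a SparkSession plus the StopWordsRemover import are assumed to be in scope:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
words_df = spark.createDataFrame([(["the", "quick", "brown", "fox"],)], ["words"])
# "the" is dropped by the default English stop-word list
remove_stop_words(words_df, "words", "words_clean").show(truncate=False)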
Example #4
    def process(reviews):
        if (reviews.isEmpty()):
            pass
        else:
            start = time.time()
            # split reviews into positive (rating > 3) and negative (rating < 3)
            pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
            neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
            #set label for each class. 0.0 is positive - 1.0 is negative
            review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

            Words = Row('label', 'words')
            words = reviews.map(lambda r: Words(*r))
            words_df = spark.createDataFrame(words)

            #reviews tokenization
            token = RegexTokenizer(minTokenLength=2,
                                   pattern="[^A-Za-z]+",
                                   inputCol="words",
                                   outputCol="token",
                                   toLowercase=True)
            token_filtered = token.transform(words_df)

            #stopwords elimination
            remover = StopWordsRemover(inputCol="token",
                                       outputCol="stopwords",
                                       caseSensitive=False)
            stopwords_filtered = remover.transform(token_filtered)

            prep_filtered = (
                stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

            #tf-idf calculation
            tf = HashingTF(numFeatures=numFeatures).transform(
                prep_filtered.map(porter_stem, preservesPartitioning=True))
            idf = IDF().fit(tf)
            train_tfidf = idf.transform(tf)

            #set training dataset with label
            training = review_labels.zip(train_tfidf).map(
                lambda x: LabeledPoint(x[0], x[1]))

            #train the model classifier
            model = NaiveBayes.train(training)
            #save model classifier to HDFS
            output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
            model.save(sc, output_dir)
            end = time.time()

            print("Total Reviews : ", reviews.count(), "Processing Time : ",
                  (end - start))

            ssc.stop()
Example #5
 def preprocessDF(self):
     reTokenizer = RegexTokenizer(pattern=r'\W+', inputCol='reformat2', outputCol='tokenKey', toLowercase=True)
     df_token = reTokenizer.transform(self.df)
     remover = StopWordsRemover(inputCol='tokenKey', outputCol='tokens')
     remover.setStopWords(list(self.stopWords))
     df_token = remover.transform(df_token)
     
     df_token=df_token.select('categories','created_at','favorite_count','quote_count','reply_count','retweet_count','user.followers_count',
     'user.favourites_count','user.friends_count','tokens')
     
     self.df1.write.json(os.path.join(bdenv_loc,'twitter_parse_reformat_2018_01-06.json'), mode='append')
     df_token.write.json(os.path.join(bdenv_loc,'twitter_parse_tokens_2018_01-06.json'), mode='append')
Example #6
 def stopWordsRemover(self, dataset, colName):
     stopWordsList = StopWords.stopWordsKNIME
     sentimentStopWordRemover = StopWordsRemover(
         inputCol=colName,
         outputCol=self.dmxStopWords,
         stopWords=stopWordsList)
     dataset = sentimentStopWordRemover.transform(dataset)
     textProcessing = TextProcessing()
     dataset = textProcessing.stemming(dataset, pc.DMXSTOPWORDS)
     dataset = textProcessing.ngrams(dataset, pc.DMXSTOPWORDS, 2)
     dataset = textProcessing.lemmatization(dataset, pc.DMXSTOPWORDS)
     return dataset
Example #7
def functions_for_deal_with_texts(spark, resources_folder):
    send_df = spark.createDataFrame([
        (0, 'Hi I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ], ['id', 'sentence'])

    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    regularTokenizer = RegexTokenizer(
        inputCol='sentence',
        outputCol='words',
        pattern='\\W')
    count_token = udf(lambda words: len(words), IntegerType())
    tokenize = tokenizer.transform(send_df)
    tokenize.show()
    tokenize.withColumn('tokens', count_token(col('words'))).show()

    rg_tokenize = regularTokenizer.transform(send_df)
    rg_tokenize.show()
    rg_tokenize.withColumn('tokens', count_token(col('words'))).show()

    # remove common (stop) words
    sentenceData = spark.createDataFrame([
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
    ], ["id", "raw"])

    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
    remover.transform(sentenceData).show(truncate=False)

    wordDataFrame = spark.createDataFrame([
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"])
    ], ["id", "words"])

    ngram = NGram(n=2, inputCol="words", outputCol="ngrams")

    ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.select("ngrams").show(truncate=False)
Example #8
def kmeans(params):
    path = params[0]
    k = int(params[1])
    iterations = int(params[2])
    target_dir = params[3]

    try:
        # Creating session
        spark_session = SparkSession.builder.appName(
            "project4-jwj").getOrCreate()

        # loading the files from HDFS and getting a DataFrame
        data = spark_session.read.format("csv").option("header", "true").load(
            "{}/*.csv".format(path))
        #data.show()
        # Getting the column names
        columns = data.columns
        # Removing null rows
        for i in columns:
            data = data.filter(col(i).isNotNull())

        # Breaking the content column into individual words
        tokenizer = Tokenizer(inputCol="content", outputCol="Words")
        tokenized = tokenizer.transform(data)
        #tokenized.show()
        # Removing stop words
        remover = StopWordsRemover(inputCol="Words", outputCol="Filtered")
        removed = remover.transform(tokenized)
        #removed.show()

        # Term frequency - inverse document frequency
        hashingTF = HashingTF(inputCol="Filtered",
                              outputCol="rawFeatures",
                              numFeatures=3000)

        # Getting the term frequency vectors used to train k-means with k clusters
        featurizedData = hashingTF.transform(removed)
        #featurizedData.show()

        idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)
        rescaledData.show()
        # Train KMeans
        kmean = KMeans().setK(k).setMaxIter(iterations).fit(rescaledData)
        clustersTable = kmean.transform(rescaledData)
        clustersTable.show()
        clustersTable.select("title", "prediction").repartition(
            1).write.format("com.databricks.spark.csv").save(target_dir)
    except Exception as e:
        print(str(e), file=sys.stderr)
        sys.exit(1)
Example #9
def task_four(ngram):
    """
    Set the ngram value
    :param ngram:
    :return:
    """
    params = list(inspect.getargspec(task_four))
    p = list(chain.from_iterable([i for i in params if i is not None]))
    param_values = {}
    if len(p) > 0:
        for i, v in enumerate(p):
            try:
                value = raw_input("Please enter a value for {} ==> ".format(v))
                param_values.update({v: value})
            except:
                pass
    ngram = param_values.get(p[0])

    if int(ngram) == 2:
        # --- list of stopwords
        stopwords = {
            'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
            'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'having', 'do', 'does', 'did', 'doing', 'an', 'the', 'and', 'but',
            'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
            'for', 'with', 'about', 'against', 'between', 'into', 'through',
            'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
            'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
            'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
            'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
            'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
            'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don',
            'should', 'now', ' a ', 'insured', 'sured', 'coverage', 'year',
            'dob', 'insd', 'left'
        }

        # --- remove stop words
        REMOVER = StopWordsRemover()
        stopwords = REMOVER.getStopWords()
        REMOVER.setInputCol("inter_wordlist")
        REMOVER.setOutputCol("inter_wordlist_two")

        stpwrds_rmvd_sdf = REMOVER.transform(VECTOR_DATAFRAME) \
                                    .select(["Claim_Id", "filename", "inter_wordlist_two"])

    else:
        pass
Example #10
def tokenize(df, string_cols):
    output = df
    for c in string_cols:
        output = output.withColumn('temp', f.coalesce(f.col(c), f.lit('')))
        tokenizer = RegexTokenizer(inputCol='temp',
                                   outputCol=c + "_tokens",
                                   pattern="\\W")
        remover = StopWordsRemover(inputCol=c + "_tokens",
                                   outputCol=c + "_swRemoved")
        output = tokenizer.transform(output)
        output = remover.transform(output)\
          .drop('temp', c+"_tokens")

    return output
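
A minimal usage sketch with hypothetical data; it assumes the tokenize() helper above plus "import pyspark.sql.functions as f" and the RegexTokenizer/StopWordsRemover imports are in scope. The f.coalesce call above is what lets null strings come through as empty token lists:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame(
    [("A tale of two cities", None), ("War and Peace", "a novel by Tolstoy")],
    ["title", "blurb"])
tokenize(demo, ["title", "blurb"]).show(truncate=False)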
Example #11
File: ml.py Project: ribonj/lsir
def removeStopWords(df, column):
    """
    Remove stop-words (like "the", "a", "I", etc.) from given column.
    The column must contain an array of strings.
    Transformation: array<string> --> array<string>
    """
    # creates remover to filter out common stop-words
    remover = StopWordsRemover(inputCol=column, outputCol='_'+column)
    
    # transform: array<string> --> array<string>
    df = remover.transform(df)
    
    df = replace(df, column, '_'+column)
    return df
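
removeStopWords relies on a replace helper from the same project that is not shown; a hypothetical sketch of what it presumably does (overwrite the original column with the filtered one):

def replace(df, column, new_column):
    # drop the original column and rename the filtered column back to its name
    return df.drop(column).withColumnRenamed(new_column, column)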
Example #12
def main(data_file_csv):
    news_data = spark.read.csv(data_file_csv, header=True)
    snippet_text = news_data.select('snippet')
    # replace literal "\n" escape sequences in the snippet text with spaces
    snippet_text = snippet_text.withColumn(
        'snippet', regexp_replace('snippet', r'\\n\\n|\\n', ' '))
    udf_clean = udf(clean_words, StringType())
    data_cleaned = snippet_text.withColumn("snippet_c",
                                           udf_clean(snippet_text['snippet']))
    tokenizer = Tokenizer(inputCol="snippet_c", outputCol="tokens")
    data_tokenized = tokenizer.transform(data_cleaned)
    data_tokenized_2cols = data_tokenized.select("snippet_c", "tokens")
    SWR = StopWordsRemover(inputCol="tokens", outputCol="tokens_final")
    data_final = SWR.transform(data_tokenized_2cols)
    output = data_final.select('tokens_final')
    return output
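
The snippet above relies on a clean_words helper that is not shown; a minimal hypothetical sketch (the original may clean more aggressively):

import re

def clean_words(text):
    # lowercase and keep only letters and whitespace; applied via udf to 'snippet'
    return re.sub(r"[^a-z\s]", " ", text.lower()).strip() if text else ""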
Example #13
def movie_wordcloud(df):
    title_df = df.select("id", "title")
    # Clean text
    df_clean = title_df.select(
        "id",
        lower(regexp_replace('title', "[^a-zA-Z\\s]", "")).alias('title'))

    # Tokenize text
    tokenizer = Tokenizer(inputCol='title', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select('id', 'words_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
    df_words_no_stopw = remover.transform(df_words_token).select(
        'id', 'words_clean')

    #df_words_no_stopw.show(10)

    wordsDF = df_words_no_stopw.select(explode("words_clean").alias("words"))

    wordsDF = wordsDF.select(trim(wordsDF.words).alias("words"))
    #wordsDF.show()

    wordCountDF = wordsDF.groupBy("words").count().orderBy(
        desc("count")).limit(16)
    #wordCountDF.show()
    pandD = wordCountDF.toPandas()
    pandD.drop(0, inplace=True)

    sns.barplot(y='words', x='count', data=pandD)
    plt.title("Movie Title  Analysis")
    plt.xlabel('Words Frequency')
    plt.ylabel('Words')
    #plt.show()

    wordCountDF = wordsDF.groupBy("words").count().orderBy(
        desc("count")).limit(101)
    pandD = wordCountDF.toPandas()
    pandD.drop(0, inplace=True)  # drop first row

    wordcloudConvertDF = pandD.set_index('words').T.to_dict('records')
    wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=100, relative_scaling=0.5,
                          colormap='Dark2') \
        .generate_from_frequencies(dict(*wordcloudConvertDF))
    plt.figure(figsize=(14, 10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("Words Cloud - Movie Titles")
    plt.axis('off')
    plt.show()
    """# Overview Cloud
Example #14
def get_top_n_in_array(df_lookup, top):
    df_lookup = df_lookup.select('_c0').distinct()
    tokenizer = Tokenizer(inputCol="_c0", outputCol="token_raw")
    remover = StopWordsRemover(inputCol="token_raw",
                               outputCol="token_filtered")
    df_lookup = tokenizer.transform(df_lookup)
    df_lookup = remover.transform(df_lookup)
    df_lookup = df_lookup.select(
        (F.explode("token_filtered"))).groupby("col").count().sort(
            'count', ascending=False)
    df_lookup = df_lookup.filter(F.length("col") > 2).limit(top).select(
        F.collect_list("col")).withColumnRenamed("collect_list(col)",
                                                 "to_match")
    return df_lookup
Example #15
    def fit(self):
        sqlContext = SparkSession.builder.getOrCreate()
        if self.test:
            df = sqlContext.sql(
                "select * from cmp_tmp_user_identification where dt='2014-01'")
        else:
            df = sqlContext.sql("select * from cmp_tmp_user_identification")

        if self.tweet and self.retweet:
            df = df.withColumn('content', F.concat('text', 'retweeted'))
        elif self.tweet:
            df = df.filter("retweeted==' '")
            df = df.withColumn('content', F.col('text'))
        elif self.retweet:
            df = df.filter('length(retweeted)>1')
            df = df.withColumn('content', F.col('retweeted'))

        df = df.withColumn('content', textCut(clean_text('content')))
        ##stopwords
        remover = StopWordsRemover(inputCol="content",
                                   outputCol="words",
                                   stopWords=self.stopwords)
        df = remover.transform(df)
        ## clean out rows with empty token lists
        df = df.filter('size(words)>0')
        self.sentence_length_distribution = df.selectExpr(
            'size(words) as wz').groupBy('wz').count().toPandas().set_index(
                'wz').sort_index()
        ###vec
        cv = CountVectorizer(inputCol='words',
                             outputCol='vertors',
                             minDF=self.minDF,
                             minTF=self.minTF)
        model_cv = cv.fit(df)
        word2bag = model_cv.vocabulary
        self.baglen = len(word2bag)
        self.dictionary = dict(
            zip(word2bag, ['W' + str(i) for i in range(1, self.baglen)]))
        sc = SparkContext.getOrCreate()
        diction = sc.broadcast(self.dictionary)

        ## convert words to a space-separated (English-style) format for GCN
        df = df.withColumn('words_space', toSpaceSplit('words'))
        result_df = df.selectExpr('uid,label,identity,words_space'.split(','))
        ##aggregate to user level
        result_df = result_df.groupBy('uid', 'label', 'identity').agg(
            F.collect_list('words_space').alias('uid_words'))
        result_df = result_df.withColumn('uid_words', concat_uid('uid_words'))
        return result_df
Example #16
def pre_processing(cf):
    # Converting label -1 -> 0
    cf = cf.withColumn(
        "label",
        f.when(cf["label"] == -1,
               0).otherwise(cf["label"].cast(IntegerType())))

    # removing punctuations
    cf_pl = cf.rdd.map(
        lambda x: (re.sub(r'[^\w\s]', '', x.review).lower(), x.label)).toDF(
            ["review", "label"])

    # class imbalance solved here
    cf_pl = resample(cf_pl, 2)

    # Tokenize the reviews
    tokenizer = Tokenizer(inputCol="review", outputCol="tokenized")
    t = tokenizer.transform(cf_pl)

    # removing stop words
    stopwords_remover = StopWordsRemover(inputCol="tokenized",
                                         outputCol="filtered")
    s = stopwords_remover.transform(t)

    # removed empty strings in tokenized arrays
    s = s.rdd.map(lambda x:
                  (x.review, x.label, x.tokenized,
                   [y.strip() for y in x.filtered if y.strip()])).toDF(
                       ["review", "label", "tokenized", "filtered"])

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    try:
        s = s.rdd.map(lambda x: ([(lemmatizer.lemmatize(y, get_wordnet_pos(y)))
                                  for y in x.filtered])).toDF([
                                      "review", "label", "tokenized",
                                      "filtered", "lemmatized"
                                  ])
    except:
        # incase it fails, we use the filtered column
        s = s.withColumn('lemmatized', s.filtered)

    # temporary variable swap
    class_balancedDf = s

    # randomly shuffling class_balancedDf
    class_balancedDf = class_balancedDf.orderBy(rand())

    return class_balancedDf
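
pre_processing above assumes resample and get_wordnet_pos helpers defined elsewhere; a hypothetical sketch of get_wordnet_pos following the common NLTK recipe (the author's version may differ):

from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    # map the Penn Treebank tag of a word to the WordNet POS constant
    # expected by WordNetLemmatizer.lemmatize
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)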
Example #17
def cleanDf(df):
    df = df.withColumn("decisionDate", (f.col("decisionDate").cast("date")))

    cleanFT_udf = f.udf(cleanFullText, ArrayType(StringType()))
    df = df.withColumn("fullTextCleaned", cleanFT_udf(df.fullText))

    cleanK_udf = f.udf(cleanKeywords, ArrayType(StringType()))
    df = df.withColumn("keywords", cleanK_udf(df.keywords))

    remover = StopWordsRemover(inputCol="fullTextCleaned",
                               outputCol="filteredFullText",
                               stopWords=stop_words)
    df = remover.transform(df)

    return df
Example #18
def process_tweet_text(df):
    """Removes punctuation, stop words from inputCol and the output is in the outputCol Column.

    Args:
        df (DataFrame): A DataFrame with the column from which Stop Words need to be removed.

    Returns:
        DataFrame: Applying StopWordsRemover with text_clean as the input column and filtered as the output column.
    """
    df = df.withColumn('text', split(removePunctuation(df['text']), ' ').alias('text'))
    stopWordList = list(string.punctuation) + ['http', 'https', 'rt','via','...','…','’','—','—:','“'] + StopWordsRemover.loadDefaultStopWords('english')
    remover = StopWordsRemover(inputCol="text", outputCol="filtered", stopWords = stopWordList)
    df = remover.transform(df)
    df = df.withColumn('tweet', array_join(df['filtered'], ' '))
    return df.select('date', 'tweet', 'hashtags')
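
process_tweet_text assumes a removePunctuation column helper; a minimal hypothetical sketch (not necessarily the author's implementation):

from pyspark.sql.functions import lower, regexp_replace, trim

def removePunctuation(column):
    # keep letters, digits and spaces, then lowercase and trim the result
    return trim(lower(regexp_replace(column, r"[^a-zA-Z0-9\s]", "")))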
Example #19
def clean(df):
    tokenizer = Tokenizer(inputCol="body", outputCol="vector")
    remover = StopWordsRemover(inputCol="vector", outputCol="body")
    df = df.withColumn('body',
                       regexp_replace(col('body'), '<code>.*?</code>', ' '))
    df = df.withColumn('body', regexp_replace(col('body'), '<.*?>', ' '))
    df = df.withColumn('body', regexp_replace(col('body'), '&.*?;', ' '))
    df = df.withColumn(
        'body',
        regexp_replace(col('body'),
                       "[{0}]".format(re.escape(string.punctuation)), ' '))
    df = df.withColumn('body', regexp_replace(col('body'), '[^a-zA-Z]', ' '))
    df = tokenizer.transform(df).drop('body')
    df = remover.transform(df).drop('vector')
    return df
Example #20
def transform_tweet_data():
    # load
    data = spark.read.csv("dbfs:/FileStore/tweets/trump_insult_tweets_2014_to_2021.csv", header=True)
    # select
    data = data.select(split("tweet", " ").alias("tweet"), "target").dropna()
    # remove stopword
    remover = StopWordsRemover(inputCol='tweet', outputCol='tweet_clean')
    data = remover.transform(data)
    # stem
    stemmer = SnowballStemmer(language='english')
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
    data = data.withColumn("tweet_stemmed", stemmer_udf("tweet_clean")).select('target', 'tweet_stemmed')
    # clean
    data = data.withColumn("tweet", regexp_replace(concat_strings("tweet_stemmed"), '"', "")).select("tweet", "target")
    
    return data
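
The concat_strings call above is a helper not shown in the snippet; a hypothetical sketch that joins the stemmed-token array back into a single string:

from pyspark.sql.functions import concat_ws

def concat_strings(col_name):
    # concat_ws also accepts array<string> columns and joins their elements
    return concat_ws(" ", col_name)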
Example #21
def clean_data():
    """
    Clean the Tweet by removing punctuations and stop words
    :return cleaned data:
    """
    data = sc.textFile("data/data.txt")
    col_rdd = data.map(lambda x: (x.split('\t')[0], x[-1]))
    punctuation_removed_rdd = col_rdd.map(
        lambda x: (remove_punctuation(x[0]), float(x[1])))

    data_df = sqlContext.createDataFrame(punctuation_removed_rdd,
                                         ["text", "label"])
    remover = StopWordsRemover(inputCol="text",
                               outputCol="words",
                               stopWords=stopwords.words('english'))
    return remover.transform(data_df).select(["label", "words"])
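
clean_data depends on a remove_punctuation helper that must return a list of words, since StopWordsRemover expects an array column; a hypothetical sketch:

import string

def remove_punctuation(text):
    # strip punctuation, lowercase, and split into a list of words
    return text.translate(str.maketrans("", "", string.punctuation)).lower().split()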
Example #22
def preprocessing_titles(path,name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    #after Stopword removal
    remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered")
    wordsData= remover.transform(wordsData)
    
    df = wordsData.map(lambda x:x['id']).zipWithUniqueId().toDF(["id","index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    
    qr = sqlContext.sql("SELECT index,words.id,filtered FROM indices JOIN words ON words.id = indices.id")
    if name!='':
        exportOnS3(qr,"s3a://redit-preprocessed/",name)
    qr = qr.map(lambda Row:(Row['index'],Row['id'],Row['filtered']))
Example #23
    def test_stop_words_remover2(self):
        data = self.spark.createDataFrame([(["a", "b", "c"],)], ["text"])
        model = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"])

        model_onnx = convert_sparkml(model, 'Sparkml StopWordsRemover',
                                     [('text', StringTensorType([None]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = numpy.array(predicted.toPandas().words.values[0])
        data_np = numpy.array(data.toPandas().text.values[0])
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlStopWordsRemover")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['words'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #24
 def append_tokens(self,df):
     """
     Creates tokens from the pagename column in the dataframe, then removes
      stop-words from the tokens. Adds them under the columns rawTokens and tokens.
     Args:
         :param df: Dataframe to add token columns to.
     Returns:
         :return: Dataframe with new columns rawTokens and tokens.
     """
     #Tokenize pagename and convert tokens to their stem words.
     tokenize_udf = udf(tokenize_porter, returnType=ArrayType(StringType()))
     df = df.withColumn('rawTokens', tokenize_udf(df['pagename']))
     #Remove stop words.
     stop_words_remover = StopWordsRemover(inputCol="rawTokens", outputCol="tokens")
     df = stop_words_remover.transform(df)
     return df
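
append_tokens wraps a tokenize_porter function in a UDF; a hypothetical sketch using NLTK's PorterStemmer (the project's actual implementation may differ):

from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def tokenize_porter(text):
    # split the page name into lowercase tokens and stem each one
    return [_stemmer.stem(tok) for tok in text.lower().split()] if text else []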
Example #25
def topicPredict(inputs):
    #output_path = "/user/llbui/bigdata45_500"
    output_path = "C:/Users/linhb/bigdata45_500"
    query = inputs
    n = 10  #number of similar document to return
    feature = "abstract"  #feature to compare

    df = sc.parallelize([(0, query)]).toDF(["id", feature])

    tokenizer = RegexTokenizer(inputCol=feature,
                               outputCol="words",
                               pattern="\\P{Alpha}+")
    df2 = tokenizer.transform(df)

    remover = StopWordsRemover(inputCol="words", outputCol="words2")
    df3 = remover.transform(df2)

    udf_remove_words = udf(lambda x: remove_words(x), ArrayType(StringType()))
    df4 = df3.withColumn("words3", udf_remove_words(df3.words2))

    # text to feature vector - TF_IDF
    countTF_model = CountVectorizerModel.load(output_path + "/tf_model")
    df_countTF = countTF_model.transform(df4)

    idf_model = IDFModel.load(output_path + "/idf_model")
    df_IDF = idf_model.transform(df_countTF)

    # LDA Model
    lda_model = LocalLDAModel.load(output_path + "/lda_model")

    #output topics for document -> topicDistribution
    df_Feature = lda_model.transform(df_IDF)
    feature_vector = df_Feature.select("id",
                                       "topicDistribution").collect()[0][1]
    print("Feature Vector:", feature_vector)

    #Load existing document
    df_Document = sqlCt.read.load(output_path + "/topicDistribution.parquet")
    udf_cosineSimilarity = udf(
        lambda x_vector: cosineSimilarity(x_vector, feature_vector),
        FloatType())
    df_Similarity = df_Document.withColumn(
        "similarity", udf_cosineSimilarity("topicDistribution"))
    df_Similarity_Sorted = df_Similarity.sort(desc("similarity"))
    return df_Similarity_Sorted.limit(n).select("_id", "title", "abstract",
                                                "url",
                                                "topicDistribution").collect()
Example #26
def aggregate_spark(data, features, args):
    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
    import pyspark.sql.functions as F
    from pyspark.sql.types import IntegerType

    regexTokenizer = RegexTokenizer(inputCol=features["col"],
                                    outputCol="token_list",
                                    pattern="\\W")
    regexTokenized = regexTokenizer.transform(data)

    remover = StopWordsRemover(inputCol="token_list",
                               outputCol="filtered_word_list")
    max_review_length_row = (remover.transform(regexTokenized).select(
        F.size(F.col("filtered_word_list")).alias("word_count")).agg(
            F.max(F.col("word_count")).alias("max_review_length")).collect())

    return max_review_length_row[0]["max_review_length"]
Example #27
def convertToVec(df, sc, ss, outputName, inputCol='tokens'):
    print('\n\n\n Removing Stopwords... \n\n\n')
    remover=StopWordsRemover(inputCol=inputCol, outputCol='nostops', stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    df=remover.transform(df)

    cv=CountVectorizer(inputCol='nostops', outputCol='vectors',minTF=1.0)
    vecModel=cv.fit(df)
    new=False
    if new:
        print('\n\n\n Get Vocab... \n\n\n')
        inv_voc=vecModel.vocabulary 
        f = codecs.open(outputName+'_vocab.txt', encoding='utf-8', mode='w')
        for item in inv_voc:
            f.write(u'{0}\n'.format(item))
        f.close()
    vectors= vecModel.transform(df).select('id','subreddit','vectors')
    return vectors
Example #28
    def preprocessDF(self, df, cols):
        # concatenation
        df_concat = df.withColumn("concat", concat_ws(' ', *cols))

        # Split at whitespace and characters that are not letter
        tokenizer = RegexTokenizer(inputCol="concat",
                                   outputCol="words",
                                   pattern=r'\W+')
        df_tokenizer = tokenizer.transform(df_concat)

        # stopword remover
        remover = StopWordsRemover(inputCol="words",
                                   outputCol="joinKey",
                                   stopWords=self.stopWordsBC.value)
        df_remover = remover.transform(df_tokenizer) \
            .drop("concat").drop("words")
        return df_remover
Example #29
def sentiment_validate(lrModel):
    rdd = spark_context.textFile("/user/SentimentalData/Subset100k.csv")

    header = rdd.first();
    rdd = rdd.filter(lambda row: row != header)

    spark = getSparkSessionInstance(rdd.context.getConf())
    
    r = rdd.mapPartitions(lambda x : csv.reader(x))

    parts = r.map(lambda x : Row(sentence=str.strip(x[3]), label=int(x[1])))

    partsDF = spark.createDataFrame(parts)
    
    partsDF.show(truncate=False)
    
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    tokenized = tokenizer.transform(partsDF)

    tokenized.show(truncate=False)

    remover = StopWordsRemover(inputCol="words", outputCol="base_words")

    base_words = remover.transform(tokenized)

    base_words.show(truncate=False)

    train_data_raw = base_words.select("base_words", "label")

    train_data_raw.show(truncate=False)

    base_words = train_data_raw.select("base_words")

    base_words.show(truncate=False)

    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")

    model = word2Vec.fit(train_data_raw)

    final_train_data2 = model.transform(train_data_raw)

    final_train_data2.show()
    
    final_train_data2 = final_train_data2.select("label", "features")
Example #30
def remove_l_c_words(df, least, most):
    # Let's find out which words we keep
    vocabulary = df.map(lambda row: row.tweets).reduce(lambda x, y: x + y)
    count = sc.parallelize(vocabulary).map(lambda word: (word, 1)).reduceByKey(
        add)
    count = count.sortBy(lambda wc: wc[1], ascending=False)
    # Add to the list of stopwords
    stop_words_lc = count.filter(lambda wc: wc[1] == least).map(
        lambda wc: wc[0]).collect()
    if most < 1:
        stop_words = stop_words_lc
    else:
        stop_words_mc = count.map(lambda wc: wc[0]).take(most)
        stop_words = stop_words_lc + stop_words_mc
    remover = StopWordsRemover(inputCol="tweets",
                               outputCol='cleaned_tweets',
                               stopWords=stop_words)
    return remover.transform(df)
Example #31
def preprocess_text(df):

    df_select = df.dropna(subset=["raw_tweet_text"]).select(cols_select)
    # 1. clean text
    df_select_clean = (
        df_select
        .withColumn("tweet_text",
                    F.regexp_replace("raw_tweet_text", r"[@#&][A-Za-z0-9_-]+", " "))
        .withColumn("tweet_text", F.regexp_replace("tweet_text", r"\w+:\/\/\S+", " "))
        .withColumn("tweet_text", F.regexp_replace("tweet_text", r"[^A-Za-z]", " "))
        .withColumn("tweet_text", F.regexp_replace("tweet_text", r"\s+", " "))
        .withColumn("tweet_text", F.lower(F.col("tweet_text")))
        .withColumn("tweet_text", F.trim(F.col("tweet_text"))))

    # tokenize
    tokenizer = Tokenizer(inputCol="tweet_text", outputCol="tokens")

    # 2.2. remove stopwords
    stopword_remover = StopWordsRemover(inputCol="tokens",
                                        outputCol="remove_stop")
    stopword_remover.setStopWords(stopwords_list)

    #2.3. stemming
    # TODO: how to modify the stemming function into a transformer?
    stemmer = PorterStemmer()
    # more straightforward to use lambda
    stem_udf = F.udf(lambda l: [stemmer.stem(word) for word in l],
                     returnType=ArrayType(StringType()))

    df_tokenized = tokenizer.transform(df_select_clean)
    df_rmstop = stopword_remover.transform(df_tokenized)
    df_stemmed = df_rmstop.withColumn("stemmed",
                                      stem_udf(F.col("remove_stop")))

    return df_stemmed
Example #32
def tokenize_df(df): 
    
    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    remover = StopWordsRemover()
    remover.setInputCol("vector")
    remover.setOutputCol("vector_no_stopw")
    stopwords = remover.getStopWords()
    stemmer = PorterStemmer()
    stemmer_udf = udf(lambda tokens: [stemmer.stem(t) for t in tokens],
                      ArrayType(StringType()))

    df = df.select(clean_text(col("text")).alias("text"))
    df = tokenizer.transform(df).select("vector")
    df = remover.transform(df).select("vector_no_stopw")
    df = (df
        .withColumn("vector_stemmed", stemmer_udf("vector_no_stopw"))
        .select("vector_stemmed")
        )
    
    return df
Example #33
def bayes_cv(business_id):
    """
    Crossvalidation of bayes model
    """
    spark = yelp_lib.spark
    review = yelp_lib.get_parq('review')
    business_df = review.filter(review['business_id'] == business_id)

    regexTokenizer = RegexTokenizer(inputCol="text",
                                    outputCol="words",
                                    pattern="\\W")
    wordsDataFrame = regexTokenizer.transform(business_df)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned = remover.transform(wordsDataFrame)

    star_mapping = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0, 5: 1.0}

    cleaned = cleaned.replace(star_mapping, 'stars')
    cleaned = cleaned.withColumn("stars", cleaned["stars"].cast("double"))

    cv = CountVectorizer(inputCol="filtered", outputCol="features")
    model = cv.fit(cleaned)
    vectorized = model.transform(cleaned)

    vectorized = vectorized.select(
        col('stars').alias('label'), col('features'))

    splits = vectorized.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0)
    # train the model
    nb_model = nb.fit(train)
    # compute accuracy on the test set
    result = nb_model.transform(test)

    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    return "Accuracy: " + str(evaluator.evaluate(predictionAndLabels))
Example #34
def preprocessing_titles(path, name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    #after Stopword removal
    remover = StopWordsRemover(inputCol="tokenized_title",
                               outputCol="filtered")
    wordsData = remover.transform(wordsData)

    df = wordsData.map(lambda x: x['id']).zipWithUniqueId().toDF(
        ["id", "index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")

    qr = sqlContext.sql(
        "SELECT index,words.id,filtered FROM indices JOIN words ON words.id = indices.id"
    )
    if name != '':
        exportOnS3(qr, "s3a://redit-preprocessed/", name)
    qr = qr.map(lambda Row: (Row['index'], Row['id'], Row['filtered']))
Example #35
def pre_process_data(df):
    df_column = df.withColumn(
        "text",
        regexp_replace(lower(df["text"]), "[$&+,:;=?@#|'<>.-^*()%!]", ""))
    df_without = df_column.withColumn(
        "text", regexp_replace(lower(df_column["text"]), "-", " "))
    df_read = df_without.select('*').withColumn("id",
                                                monotonically_increasing_id())
    # Tokenize data
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    df_tokenized = tokenizer.transform(df_read)
    #Remove Stop Words
    language = "portuguese"
    remover = StopWordsRemover(
        inputCol="words",
        outputCol="filtered",
        stopWords=StopWordsRemover.loadDefaultStopWords(language))
    df_clean = remover.transform(df_tokenized)
    #Return dataframe
    return df_clean
Example #36
def init_base_df(file_path=default_file_path):
    # Set legacy parsing as Spark 3.0+ cannot use 'E' for timestamp
    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    print("Loading", default_file_path)

    raw_df = (
        spark.read.format("csv")
        .option("inferSchema", True)
        .load(file_path)
        .toDF("polarity", "tweet_id", "datetime", "query", "user", "text")
    )

    # Parse string to timestamp
    time_parsed_df = raw_df.withColumn(
        "timestamp", to_timestamp("datetime", "EEE MMM dd HH:mm:ss zzz yyyy")
    )

    df = time_parsed_df.drop("query").drop("datetime")

    # Shift polarity from a range of [0:4], to [-1:1]
    scaled_polarity_df = df.withColumn("sentiment", (col("polarity") / 2) - 1).drop(
        "polarity"
    )

    clean_text_df = df.select(clean_text(col("text")).alias("text"), "tweet_id")

    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    vector_df = tokenizer.transform(clean_text_df).select("vector", "tweet_id")

    remover = StopWordsRemover()
    stopwords = remover.getStopWords()

    remover.setInputCol("vector")
    remover.setOutputCol("tokens")

    tokens_no_stopw_df = remover.transform(vector_df).select("tokens", "tweet_id")

    tweets_with_tokens_df = scaled_polarity_df.join(tokens_no_stopw_df, on=["tweet_id"])

    return tweets_with_tokens_df
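
init_base_df (like tokenize_df in Example #32) applies a clean_text column helper that is not shown; a hypothetical sketch built from standard column functions:

from pyspark.sql.functions import lower, regexp_replace, trim

def clean_text(c):
    # lowercase, drop URLs and @mentions, keep only letters and spaces, trim
    c = lower(c)
    c = regexp_replace(c, r"(https?://\S+)|(@\w+)", " ")
    c = regexp_replace(c, r"[^a-z\s]", " ")
    return trim(c)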
Example #37
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
Example #38
        )

strip_tags_udf = udf(strip_tags)
tokenizer = Tokenizer(inputCol="comment_clean", outputCol="words")
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="tokens")

# Load data
comments = sqlContext.read.json(fn)

# Calculate tokens dataframe as one pipeline
tokens = stopWordsRemover.transform(
             tokenizer.transform(comments\
                 .withColumn("comment_clean", strip_tags_udf(comments["comment_text"]))\
             )\
         )\
         .select(explode("tokens").alias("token"))\
         .groupBy("token")\
         .count()\
         .orderBy("count", ascending=False)\
         .select("count")\
         .limit(1000)

# Switch to Pandas
tokens_pdf = tokens.toPandas()
tokens_pdf = tokens_pdf.iloc[1:]
tokens_pdf["rank"] = range(1, tokens_pdf.shape[0] + 1)
print(tokens_pdf.head())

# Make a graph
fig = sns.jointplot(x="rank", y="count", data=tokens_pdf)
fig.savefig('temp.png')
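
Example #38 wraps a strip_tags helper in a UDF before tokenizing; a simple regex-based hypothetical sketch (the original may use a proper HTML parser):

import re

def strip_tags(html):
    # drop HTML tags and collapse whitespace in a comment string
    return re.sub(r"\s+", " ", re.sub(r"<[^>]+>", " ", html)).strip() if html else ""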
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setGaps(False)\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
  .setStopWords(englishStopWords)\
  .setInputCol("DescOut")
stops.transform(tokenized).show()


# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)


# COMMAND ----------

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
Example #40
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import StopWordsRemover
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("StopWordsRemoverExample")\
        .getOrCreate()

    # $example on$
    sentenceData = spark.createDataFrame([
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
    ], ["id", "raw"])

    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
    remover.transform(sentenceData).show(truncate=False)
    # $example off$

    spark.stop()
Example #41
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *
from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    fields = [StructField("subreddit", StringType(), True),
          StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))
    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)
    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(['subreddit', explode(filteredDataFrame.filtered).alias("term")])
    # group by subreddit and term, then count occurrence of term in subreddits
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db =  Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']).filter('count > {}'.format(THRESHOLD)).foreachPartition(db.saveSubredditWords)
Example #42
 def preprocess_tweets(tweets):
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
     tweets = tokenizer.transform(tweets)
     remover = StopWordsRemover(inputCol="words", outputCol="filtered")
     tweets = remover.transform(tweets)
     return tweets