def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()

    # Input data: Each row is a bag of words from a sentence or document.
    training_data = [(id_gen.next(), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])

    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)

    # Learn a mapping from words to Vectors.
    word2vec = Word2Vec(vectorSize=len(training_data),
                        inputCol="text_filtered",
                        outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))

    # Use the size of the target data to keep only similarity entries
    # between the target data and the rest of the data, avoiding
    # products of the target data with itself.
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)

    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
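
# A minimal follow-up sketch (not part of the original example): the folder written by
# model.save() above can be reloaded as a PipelineModel and applied to new titles.
# An active SparkSession is assumed; the helper name is illustrative only.
from pyspark.ml import PipelineModel

def score_new_titles(spark, titles):
    reloaded = PipelineModel.load("./kmeans")  # folder produced by fit_kmeans above
    new_df = spark.createDataFrame([(t,) for t in titles], ["title"])
    # the step counter above names the prediction column "6_kmeans"
    return reloaded.transform(new_df)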
Example #3
def remove_stop_words(p_df, in_column, out_column):
    """
    Removes stop words from a column in a DataFrame. The column must be a list of words.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.
    """    
    remover = StopWordsRemover(inputCol=in_column, outputCol=out_column)
    return remover.transform(p_df)
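
# A short illustrative call of the helper above (not from the source); it assumes an
# active SparkSession named `spark`.
tokens_df = spark.createDataFrame([(1, ["the", "quick", "brown", "fox"])], ["id", "tokens"])
remove_stop_words(tokens_df, "tokens", "tokens_filtered").show(truncate=False)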
Example #4
def removeStopWords(df, column):
    """
    Remove stop-words (like "the", "a", "I", etc.) from given column.
    The column must contain an array of strings.
    Transformation: array<string> --> array<string>
    """
    # creates remover to filter out common stop-words
    remover = StopWordsRemover(inputCol=column, outputCol='_'+column)
    
    # transform: array<string> --> array<string>
    df = remover.transform(df)
    
    df = replace(df, column, '_'+column)
    return df
Example #5
 def append_tokens(self,df):
     """
     Creates tokens from the pagename column in the dataframe then removes
      stop-words from the tokens. Adds the tokens under the column rawTokens and tokens.
     Args:
         :param df: Dataframe to add token columns to.
     Returns:
         :return: Dataframe with new columns rawTokens and tokens.
     """
     #Tokenize pagename and convert tokens to their stem words.
     tokenize_udf = udf(tokenize_porter, returnType=ArrayType(StringType()))
     df = df.withColumn('rawTokens', tokenize_udf(df['pagename']))
     #Remove stop words.
     stop_words_remover = StopWordsRemover(inputCol="rawTokens", outputCol="tokens")
     df = stop_words_remover.transform(df)
     return df
def preprocessing_titles(path,name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    #after Stopword removal
    remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered")
    wordsData= remover.transform(wordsData)
    
    df = wordsData.map(lambda x:x['id']).zipWithUniqueId().toDF(["id","index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    
    qr = sqlContext.sql("SELECT index,words.id,filtered FROM indices JOIN words ON words.id = indices.id")
    if name!='':
        exportOnS3(qr,"s3a://redit-preprocessed/",name)
    qr = qr.map(lambda Row:(Row['index'],Row['id'],Row['filtered']))
class SentimentalPipelineEngine(PipelineEngine):
    def __init__(self, cv):
        super(SentimentalPipelineEngine, self).__init__(cv)
        self.tokenizer_map = [TweetTokenizer()]
        self.ngram_map = [1]
        self.hashing_tf_map = [pow(2, 20)]
        self.clf_map = [0.1]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=self.stages)
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizzzer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        self.porter = PorterStemmerTransformer(inputCol=self.stopwords_remover.getOutputCol(), outputCol="stemmed")
        self.ngram = NGram(inputCol=self.porter.getOutputCol(), outputCol="ngrams")
        self.hashing_tf = HashingTF(inputCol=self.ngram.getOutputCol(), outputCol="features")
        self.idf = IDF(inputCol="features", outputCol="idf_features")
        self.normalizer = Normalizer(inputCol="idf_features", outputCol="norm_features", p=1.0)
        self.clf = LogisticRegression(featuresCol='norm_features', regParam=0.1)
        # self.clf = MultilayerPerceptronClassifier(featuresCol="norm_features", maxIter=1000, layers=[self.hashing_tf.getNumFeatures(), 200, 100, 2])
        return [self.bs_parser, self.tokenizer, self.stopwords_remover, self.porter, self.ngram, self.hashing_tf, self.idf, self.normalizer, self.clf]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
        param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
        return param_grid_builder.build()
def create_pipeline(model_type, num_features=10000):
    """
    Defines pipeline from BOW to prediction.
    """

    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts", numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                outputCol="features")

    if model_type == 'log_reg':
        model = LogisticRegression()
    elif model_type == 'gbt':
        model = GBTClassifier()
    elif model_type == 'naive_bayes':
        model = NaiveBayes()
    elif model_type == 'rf':
        model = RandomForestClassifier()

    return Pipeline(stages=[remover, hashingTF, tfidf,
                                model])
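
# Illustrative usage of create_pipeline (an assumption, not shown in the source): the
# input DataFrame needs a tokenized "bow" column and a numeric "label" column; the tiny
# dataset and the SparkSession `spark` are placeholders for the sketch.
train_df = spark.createDataFrame(
    [(["free", "prize", "click", "now"], 1.0),
     (["see", "you", "at", "lunch", "tomorrow"], 0.0)],
    ["bow", "label"])
spam_model = create_pipeline('log_reg', num_features=1024).fit(train_df)
spam_model.transform(train_df).select("bow", "prediction").show(truncate=False)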
Example #10
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
sc = SparkContext(appName="Tweet")
spark = SparkSession(sc)

sc.setLogLevel("WARN")
# read the dataset
training_set = spark.read.csv(
    '../tap/spark/dataset/training_set_sentipolc16.csv',
    schema=schema,
    header=True,
    sep=',')
training_set

# define stage 1: tokenize the tweet text
stage_1 = RegexTokenizer(inputCol='tweet', outputCol='tokens', pattern='\\W')
# define stage 2: remove the stop words
stage_2 = StopWordsRemover(inputCol='tokens', outputCol='filtered_words')
# define stage 3: create a word vector of the size 100
stage_3 = Word2Vec(inputCol='filtered_words',
                   outputCol='vector',
                   vectorSize=100)
# define stage 4: Logistic Regression Model
model = LogisticRegression(featuresCol='vector', labelCol='positive')
# setup the pipeline
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, model])

# fit the pipeline model with the training data
pipelineFit = pipeline.fit(training_set)

modelSummary = pipelineFit.stages[-1].summary
modelSummary.accuracy
Example #12
 def test_stopwordsremover(self):
     dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
     stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
     # Default
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["panda"])
     self.assertEqual(type(stopWordRemover.getStopWords()), list)
     self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring))
     # Custom
     stopwords = ["panda"]
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a"])
     # with language selection
     stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
     dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
     # with locale
     stopwords = ["BELKİ"]
     dataset = self.spark.createDataFrame([Row(input=["belki"])])
     stopWordRemover.setStopWords(stopwords).setLocale("tr")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
# split the text into tokens
# removed stop words
# applied the hashing trick
# converted the data from counts to IDF and
# trained a logistic regression model.
# Each of these steps was done independently. This seems like a great application for a pipeline!

# Instructions
# 100 XP
# Create an object for splitting text into tokens.
# Create an object to remove stop words. Rather than explicitly giving the input column name, use the getOutputCol() method on the previous object.
# Create objects for applying the hashing trick and transforming the data into a TF-IDF. Use the getOutputCol() method again.
# Create a pipeline which wraps all of the above steps as well as an object to create a Logistic Regression model.

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])
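
# A brief follow-up sketch (not part of the exercise text): fit the pipeline on a toy
# DataFrame with 'text' and 'label' columns and inspect the predictions. The data and
# the SparkSession `spark` are assumptions made for this illustration.
toy_df = spark.createDataFrame(
    [('win a free prize now', 1.0), ('are we still meeting tomorrow', 0.0)],
    ['text', 'label'])
toy_model = pipeline.fit(toy_df)
toy_model.transform(toy_df).select('label', 'prediction').show()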
Example #14
appName = "News label prediction"
master = 'local'

spark = SparkConf().setAppName(appName).setMaster(master).set(
    'spark.executor.memory', '4G').set('spark.driver.memory',
                                       '45G').set('spark.driver.maxResultSize',
                                                  '10G')
sc = SparkContext(conf=spark)
sqlContext = SQLContext(sc)

df = sqlContext.createDataFrame(indexNewsList, ["label", "text"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")

remover = StopWordsRemover(inputCol="words", outputCol="filtered")

hashingTF = HashingTF(inputCol="filtered",
                      outputCol="rawFeatures",
                      numFeatures=10000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=100,
                            maxDepth=15,
                            maxBins=32)
Example #15
        .orderBy(col("count").desc()) \
        .show()

# set seed for reproducibility
(training_data, test_data) = df.randomSplit([0.7, 0.3], seed=100)
print("Training data count: " + str(training_data.count()))
print("Test data count: " + str(test_data.count()))

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text",
                                outputCol="words",
                                pattern="\\W")

# stop words
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "RT", "@"]
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered",
                               outputCol="features",
                               vocabSize=10000,
                               minDF=5)

# convert string labels to indexes
indexer = StringIndexer(inputCol="polarity", outputCol="label")

# feature-selector
selector = ChiSqSelector(numTopFeatures=10,
                         featuresCol="features",
                         outputCol="selectedFeatures",
                         labelCol="label")
Example #16
names_df = df.select('name')
names_df.show()
# listings.filter(listings["name"].isNotNull())

#%%
names_df = names_df.dropna(subset='name')
names_df.show()

#%%
tokenizer = Tokenizer(inputCol="name", outputCol="words")
wordsData = tokenizer.transform(names_df)
wordsData.show()

#%%
stopwords = []
stopwords.extend(StopWordsRemover.loadDefaultStopWords('english'))
remover = StopWordsRemover(inputCol="words",
                           outputCol="cleanedWords",
                           stopWords=stopwords)
cleanedWordsData = remover.transform(wordsData)
cleanedWordsData.show()

#%%
hashingTF = HashingTF(numFeatures=4096,
                      inputCol="cleanedWords",
                      outputCol="tfFeatures")
tfWordsData = hashingTF.transform(cleanedWordsData)
tfWordsData.show()

#%%
idf = IDF(inputCol="tfFeatures", outputCol="tfIdfFeatures")
if __name__ == '__main__':
    conn = S3Connection()
    sc = set_spark_context()
    sqc = SQLContext(sc)
    sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')

    logging.basicConfig(format='%(asctime)s %(message)s')
    grid_search = logging.getLogger('main')
    grid_search.setLevel(logging.DEBUG)
    handler = logging.FileHandler('../logs/grid_search.txt')
    grid_search.addHandler(handler)

    bow_rdd = sm.RDD.map(lambda (key, (bow, meta)): (key, bow))
    bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow))

    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                numFeatures=10000)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",
                minDocFreq=20)
    indexer = StringIndexer(inputCol="string_label", outputCol="label")

    for model in [GBTClassifier(), RandomForestClassifier(), MultilayerPerceptronClassifier()]:

        if type(model) == MultilayerPerceptronClassifier:
            layers = [10000, 100, 2]
            model = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128)

        pipeline = Pipeline(stages=[remover, hashingTF, tfidf, # scaler,
                                    indexer, model])
        scores = cross_val_score(pipeline, bow_rdd)
Example #18
def main(root_path):
    timeStamp = str(int(time()))
    # todo change this for full run
    num = 1000  # 128915 is the total
    out_file_name = '../out/output-' + timeStamp + "-" + str(num) + '.txt'
    out_file = open(out_file_name, 'w')

    start = time()
    spark = init_spark()
    json_files = read_json_files(root_path, spark, num)
    data = get_body_text(spark, json_files)
    print("data reading done")

    # clean the data
    word_clean_up_F = F.udf(lambda x: clean_up(x), StringType())
    data = data.withColumn("body_text_cleaned", word_clean_up_F("body_text"))
    data = data.select("body_text_cleaned")
    print("data processing done")

    tokenizer = Tokenizer(inputCol="body_text_cleaned", outputCol="words")
    token_DataFrame = tokenizer.transform(data)
    token_DataFrame = token_DataFrame.select("words")

    # Remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned_DataFrame = remover.transform(token_DataFrame)
    cleaned_DataFrame = cleaned_DataFrame.select('filtered')

    # Count vectorizer
    cv_tmp = CountVectorizer(inputCol="filtered", outputCol="count_features")
    cvmodel = cv_tmp.fit(cleaned_DataFrame)
    count_dataframe = cvmodel.transform(cleaned_DataFrame)
    count_dataframe = count_dataframe.select('count_features')

    # TF-IDF Vectorizer
    tfidf = IDF(inputCol="count_features", outputCol="features")
    tfidfmodel = tfidf.fit(count_dataframe)
    tfidf_dataframe = tfidfmodel.transform(count_dataframe).select("features")

    print("Ready to fit with the LDA model")
    # Fit the LDA Model
    num_topics = 5
    max_iterations = 20
    lda_start = time()
    lda = LDA(seed=1, optimizer="em", k=num_topics, maxIter=max_iterations)
    lda_model = lda.fit(tfidf_dataframe)
    lda_transformed = lda_model.transform(tfidf_dataframe)
    lda_end = time()
    print("LDA complete")
    # joblib.dump(lda_model, 'lda.csv')

    # Get terms per topic
    topics = lda_model.topicsMatrix()
    vocabArray = cvmodel.vocabulary

    wordNumbers = 15  # number of words per topic
    topicIndices = lda_model.describeTopics(maxTermsPerTopic=wordNumbers).rdd.map(tuple)

    topics_final = topicIndices.map(lambda topic: topic_render(topic, wordNumbers, vocabArray)).collect()

    for topic in range(len(topics_final)):
        print("Topic " + str(topic) + ":")
        print("Topic " + str(topic) + ":", file=out_file)
        print(topics_final[topic])
        print(topics_final[topic], file=out_file)

    print("Full runtime : {} min. ".format((time() - start) / 60))
    print("LDA runtime : {} min. ".format((lda_end - lda_start) / 60))
    print("Check" + out_file.name)

    cleaned_DataFrame.cache()
    lda_transformed.cache()

    # Data Visualization
    data = format_data_to_pyldavis(cleaned_DataFrame, cvmodel, lda_transformed, lda_model)
    print("Preparing data with pyLDAvis ...")
    filter_bad_docs(data)
    py_lda_prepared_data = pyLDAvis.prepare(**data)
    file_name = '../out/data-viz-' + timeStamp + '.html'
    print("Saving pyLDAvis html page ...")
    pyLDAvis.save_html(py_lda_prepared_data, file_name)
    pyLDAvis.show(py_lda_prepared_data)
    spark.stop()
Example #19
tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words")\
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

from pyspark.ml.feature import StopWordsRemover

sentenceData = spark.createDataFrame(
    [(0, ["I", "saw", "the", "red", "balloon"]),
     (1, ["Mary", "had", "a", "little", "lamb"])], ["id", "raw"])

remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)

from pyspark.ml.feature import NGram

wordDataFrame = spark.createDataFrame(
    [(0, ["Hi", "I", "heard", "about", "Spark"]),
     (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
     (2, ["Logistic", "regression", "models", "are", "neat"])],
    ["id", "words"])

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
word = remover.transform(wordDataFrame).show(truncate=False)
ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams")

ngramDataFrame = ngram.transform(word)
Example #20
  plt.imshow(wordcloud, interpolation="bilinear")
  plt.axis("off")

# Plot the word cloud:
plot_word_cloud(tokenized, "words")


# ### Remove common (stop) words from each review

# Note that the ride reviews contain a number of common words such as "the"
# that we do not expect to be relevant.
# Use the
# [StopWordsRemover](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover)
# class to remove these so-called *stop words*:
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol="words", outputCol="relevant_words")
remover.getStopWords()[:10]
removed = remover.transform(tokenized)
removed.select("words", "relevant_words").head(5)

# Plot the word cloud:
plot_word_cloud(removed, "relevant_words")


# ### Count the frequency of words in each review

# Use the
# [CountVectorizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.CountVectorizer)
# class to compute the term frequency:
from pyspark.ml.feature import CountVectorizer
vectorizer = CountVectorizer(inputCol="relevant_words", outputCol="word_count_vector", vocabSize=100)
Example #21
#aprendizajemaquina_df.select(removerPuntuacionNumeros(col('value'))).show(truncate=False)

#####, RegexTokenizer
######### tokenizer
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
tokenizer = Tokenizer(inputCol="value", outputCol="palabras")
aprendizajemaquina_df = tokenizer.transform(aprendizajemaquina_df)
aprendizajemaquina_df.select("value", "palabras").show(6, False)
aprendizajemaquina_df.select("palabras").show(5, False)

######### stopwords
from pyspark.ml.feature import StopWordsRemover

listaConectores = StopWordsRemover.loadDefaultStopWords("spanish")
remover = StopWordsRemover(inputCol="palabras",
                           outputCol="filtro_conectores",
                           stopWords=listaConectores)
aprendizajemaquina_df = remover.transform(aprendizajemaquina_df)
aprendizajemaquina_df.select("palabras").show(5, False)
aprendizajemaquina_df.select("filtro_conectores").show(5, False)

############# Model training
###### Words to vectors with Word2Vec

from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline

w2v = Word2Vec(vectorSize=100,
               minCount=0,
Example #22
def main(dict):

    filename = dict['filename']
    savedmodelName = dict['modelname']

    def myFunc(input):
        lines = input.split("\n")
        for line in lines:
            parts = line.split(";")
            Category = parts[-1]
            Sentence = parts[1]
            url_pattern = re.compile(r'(http[s]://[\w./]+)*')
            rt_pattern = re.compile('RT @\w+: ')
            r_pattern = re.compile('@\w+ ')
            Sentence = r_pattern.sub(
                r'', rt_pattern.sub(r'',
                                    url_pattern.sub(r'', Sentence))).replace(
                                        '\n', ' ').strip()
        return (Category, Sentence)

    file = sc.textFile("4CVTweets/" + filename)
    lines = file.map(myFunc)
    sentenceDataFrame = spark.createDataFrame(lines, ["label", "sentence"])
    (trainingData, testData) = sentenceDataFrame.randomSplit([0.7, 0.3])
    df = spark.createDataFrame([(0, "NO"), (1, "crash"), (2, "fire"),
                                (3, "shooting")], ["id", "label"])

    # start building the pipeline
    # No: 0,Crash:1,Fire:2,Shooting:3

    indexer = StringIndexer(inputCol="label", outputCol="categoryIndex")
    indexer.fit(df)

    tokenizer = RegexTokenizer(pattern="\\w+",
                               inputCol="sentence",
                               outputCol="words",
                               gaps=False)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    hashingTF = HashingTF(inputCol="filtered",
                          outputCol="rawFeatures",
                          numFeatures=10000)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

    # # Compute the Inverse Document Frequency (IDF) given a collection of documents.

    rf = RandomForestClassifier(labelCol="categoryIndex",
                                featuresCol="features",
                                numTrees=100,
                                maxDepth=10)

    # Using randomForest
    # mlr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8, family="multinomial",featuresCol="features",labelCol="categoryIndex")
    # Naive Bayes
    nb = NaiveBayes(labelCol="categoryIndex",
                    featuresCol="features",
                    smoothing=1)

    # converter = IndexToString(inputCol="prediction", outputCol="originalCategory")
    pipeline = Pipeline(
        stages=[indexer, tokenizer, remover, hashingTF, idf, nb])
    model = pipeline.fit(trainingData)

    # Start to count accuracy to evaluate the model using just the offline model

    predictionsForTraining = model.transform(trainingData)

    predictionsForTraining.show(100, False)

    joindf = spark.createDataFrame([(0.0, "NO"), (1.0, "crash"), (2.0, "fire"),
                                    (3.0, "shooting")],
                                   ["prediction", "Predictlabel"])
    innerjoin = predictionsForTraining.join(
        joindf, joindf.prediction == predictionsForTraining.prediction).drop(
            joindf.prediction)

    # innerjoin.select("label","categoryIndex","prediction","Predictlabel").show(1000,False)
    innerjoin.select("label", "Predictlabel").show(1000, False)

    evaluator1 = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")
    accuracy = evaluator1.evaluate(predictionsForTraining)
    print("Test Accuracy = %g " % (accuracy))
    print("Train Error = %g " % (1.0 - accuracy))

    predictions = model.transform(testData)
    evaluator2 = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")

    accuracy = evaluator2.evaluate(predictions)
    print("Test Accuracy = %g " % (accuracy))
    print("Test Error = %g " % (1.0 - accuracy))

    savePath = "tmp/pipeline/" + savedmodelName
    model.write().overwrite().save(savePath)
    print("model for Location", savedmodelName, "save successfully.")
# <h1>Tokenizer on a purticular column</h1>

# In[46]:

tokenized = tokenizer.transform(df_person)
tokenized.select("desc", "words").withColumn("tokens", countTokens(
    col("words"))).show(truncate=False)

# <h3>Stop Word Removal</h3>

# In[55]:

#Load Stop Word Remover
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
remover.transform(tokenized).show(truncate=False)

# <h1>Binary Tokenization Example</h1>

# In[56]:

from pyspark.ml.feature import Binarizer

continuousDataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                            ["id", "feature"])

binarizer = Binarizer(threshold=0.5,
                      inputCol="feature",
                      outputCol="binarized_feature")
Example #24
def create_w2v_model():
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .config("spark.memory.offHeap.enabled", True) \
        .config("spark.memory.offHeap.size", "2g") \
        .getOrCreate()

    input_file = spark.sparkContext.wholeTextFiles(PATH)

    print("""
    
    Preparing data (1)...
    
    """)
    prepared_data = input_file.map(lambda x: (x[0], remove_punctuation(x[1])))

    print("""
    
    Preparing data (2)...
    
    """)
    df = prepared_data.toDF()

    print("""
    
    Preparing data (3)...
    
    """)
    prepared_df = df.selectExpr('_2 as text')

    print("""
    
    Tokenizing...
    
    """)
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    print("""
    
    Removing stop words...
    
    """)
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stop_words)
    filtered = remover.transform(words)

    print("""
    
    Building the model...
    
    """)
    word2Vec = Word2Vec(vectorSize=50, inputCol='filtered', outputCol='result', minCount=2)
    model = word2Vec.fit(filtered)

    print("""
    
    Saving the model...
    
    """)
    today = datetime.datetime.today()
    model_name = today.strftime("model/kurs_model")
    print("""
    
    Model  """ + model_name + """  saved
    
    """)
    model.save(model_name)

    spark.stop()
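
# A minimal follow-up sketch (an assumption, not in the original script): once saved, the
# model can be reloaded with Word2VecModel.load and queried for nearest neighbours. An
# active SparkSession is assumed, and the helper name is illustrative only.
def show_synonyms(word, n=5):
    from pyspark.ml.feature import Word2VecModel
    loaded_model = Word2VecModel.load("model/kurs_model")  # path written by create_w2v_model()
    loaded_model.findSynonyms(word, n).show()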
Example #25
def Topic_Modeling(tweet_df):

    # Initializing Model for tokenizing the tweets for each user

    tokenizer = Tokenizer(inputCol="concat_ws( , collect_list(tweet))",
                          outputCol="words")

    regexTokenizer = RegexTokenizer(
        inputCol="concat_ws( , collect_list(tweet))",
        outputCol="tokens",
        pattern="\\W+",
        minTokenLength=4)

    # udf for counting tokens

    countTokens = udf(lambda words: len(words), IntegerType())

    # Tokenizing the data

    regexTokenized = regexTokenizer.transform(tweet_df)

    regexTokenized.select("user_id","concat_ws( , collect_list(tweet))", "tokens") \
       .withColumn("token_count", countTokens(col("tokens"))).show()

    #print(regexTokenized.select("user_id","concat_ws( , collect_list(tweet))", "tokens") \
    #    .withColumn("token_count", countTokens(col("tokens"))).toPandas().head())

    # Defining udf for stemming using the nltk porter stemmer

    p_stemmer = PorterStemmer()

    def stem(x):
        stemmed_tokens = [p_stemmer.stem(i) for i in x]
        return stemmed_tokens

    stem_udf = udf(lambda x: stem(x), ArrayType(StringType()))

    # Stemming tokens
    stemmedTokens = regexTokenized.withColumn("Stemmed_tokens", stem_udf('tokens'))

    stemmedTokens.select("user_id", "concat_ws( , collect_list(tweet))",
                         "tokens", "Stemmed_tokens").show()

    # Defining model for stopwords

    stopWords_remover = StopWordsRemover(inputCol="Stemmed_tokens",
                                         outputCol="filtered")
    default_StopWords = stopWords_remover.getStopWords()
    default_StopWords.append("https")
    stopWords_remover.setStopWords(default_StopWords)

    # Removing Stopwords

    filtered_df = stopWords_remover.transform(stemmedTokens)

    filtered_df.withColumn("Pre_tokens",
                           countTokens(col("Stemmed_tokens"))).withColumn(
                               "Post_tokens",
                               countTokens(col("filtered"))).show()

    #print(filtered_df.withColumn("Pre_tokens", countTokens(col("Stemmed_tokens"))).withColumn("Post_tokens", countTokens(col("filtered"))).toPandas().head())

    # Defining model to convert text documents to vectors of token counts

    countVect = CountVectorizer(inputCol="filtered",
                                outputCol="features",
                                vocabSize=1000,
                                minDF=5)

    model = countVect.fit(filtered_df)

    vectorizer = model.transform(filtered_df).select("user_id", "features")
    vectorizer.show(10)

    # Initializing LDA topic Modeling

    lda = LDA(k=10, maxIter=10)
    lda_model = lda.fit(vectorizer)

    #topics = lda_model.topicsMatrix()

    ll = lda_model.logLikelihood(vectorizer)
    lp = lda_model.logPerplexity(vectorizer)
    print("The lower bound on the log likelihood of the entire corpus: " +
          str(ll))
    print("The upper bound on perplexity: " + str(lp))

    # Describe topics.
    topics = lda_model.describeTopics(5)

    print("The topics described by their top-weighted terms:")

    topics.show(truncate=False)

    # UDF for formatting the topics for desired usage

    zip_ = udf(
        lambda x, y: list(zip(x, y)),
        ArrayType(
            StructType([
                # Adjust types to reflect data types
                StructField("first", IntegerType()),
                StructField("second", DoubleType())
            ])))

    topics_df=topics.withColumn("tmp", zip_("termIndices", "termWeights")).withColumn("tmp", explode("tmp"))\
        .select("topic", col("tmp.first").alias("termIndices"), col("tmp.second").alias("termWeights"))

    # Extracting documents vocabulary

    vocab = model.vocabulary

    # UDF for extracting words for term indices assigned to each topic
    words_ = udf(lambda x: vocab[x])
    topics_df = topics_df.withColumn("Words", words_('termIndices'))
    topics_df = topics_df.groupBy("topic").agg(
        collect_list(col("Words")).alias("Words"),
        collect_list(col("termIndices")).alias("termIndices"),
        collect_list(col("termWeights")).alias("termWeights"))
    print("The topics described by their top-weighted terms:")
    print(topics_df.toPandas())

    # Shows the result
    transformed = lda_model.transform(vectorizer)
    transformed.show(truncate=False)

    # UDF to extract top topic for each user

    max_value = udf(lambda x: max(x).item())
    max_index = udf(lambda x: x.argmax().item())

    User_Topics_df = transformed.withColumn(
        "Topic_Prob", max_value("topicDistribution")).withColumn(
            "Topic",
            max_index("topicDistribution")).select("user_id", "Topic_Prob",
                                                   "Topic",
                                                   "topicDistribution")

    User_Topics_df.show(truncate=False)

    # Number of users for each topic assigned to them
    User_Topics_df.groupBy("Topic").count().show()

    # saving userprofile in csv output
    user_profile = User_Topics_df.join(topics_df,
                                       User_Topics_df.Topic == topics_df.topic)
    user_profile.toPandas().to_csv('user_profile.csv')
        tmp.append(i.value.split("||")[0])
        tmp.append(i.value.split("||")[1])
        data.append(tmp)

    print(len(data))
    df = sqlContext.createDataFrame(data, schema=["category", "text"])

    # regular expression tokenizer
    regex_tokenizer = RegexTokenizer(inputCol="text",
                                     outputCol="words",
                                     pattern="\\W")

    # stop words
    stop_words = list(set(stopwords.words('english')))

    stop_words_remover = StopWordsRemover(
        inputCol="words", outputCol="filtered").setStopWords(stop_words)

    # bag of words count
    count_vectors = CountVectorizer(inputCol="filtered",
                                    outputCol="features",
                                    vocabSize=10000,
                                    minDF=5)
    label_string_index = StringIndexer(inputCol="category", outputCol="label")
    label_string_index.setHandleInvalid("keep")

    pipeline = Pipeline(stages=[
        regex_tokenizer, stop_words_remover, count_vectors, label_string_index
    ])
    (training_data, test_data) = df.randomSplit([0.8, 0.2], seed=100)
    pipeline_fit = pipeline.fit(training_data)
    pipeline_fit.save("lr_pipeline")
def login():
    message = ''
    e_result = ''
    s_result = ''
    t_result = ''
    j_result = ''

    if request.method == 'POST':
        post = request.form.get('text')  # access the data inside

        if len(post) >= 100:

            test = pd.DataFrame([post], columns=['post'])

            newrows = []

            def filter_text(post):
                """Decide whether or not we want to use the post."""
                # should remove link only posts here
                return len(post) > 0

            reg_punc = re.compile('[%s]' % re.escape(string.punctuation))

            def preprocess_text(post):
                """Remove any junk we don't want to use in the post."""

                # Remove links
                post = re.sub(r'http\S+', '', post, flags=re.MULTILINE)

                # All lowercase
                post = post.lower()

                # Remove punctuation
                post = reg_punc.sub('', post)

                return post

            def create_new_rows(row):
                posts = row['post']
                rows = []

                # for p in posts:
                p = preprocess_text(posts)
                rows.append({'post': p})
                return rows

            for index, row in test.iterrows():
                newrows += create_new_rows(row)

            test = pd.DataFrame(newrows)

            df = spark.createDataFrame(test)

            # Create a length column to be used as a future feature
            df = df.withColumn('length', length(df['post']))

            types = [
                'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
                'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'
            ]
            types = [x.lower() for x in types]

            tokenizer = Tokenizer(inputCol="post", outputCol="words")
            tokenized = tokenizer.transform(df)

            # Remove stop words
            stopwordList = types
            stopwordList.extend(StopWordsRemover().getStopWords())
            stopwordList = list(set(stopwordList))  # optional
            remover = StopWordsRemover(inputCol="words",
                                       outputCol="filtered",
                                       stopWords=stopwordList)
            newFrame = remover.transform(tokenized)

            # Run the hashing term frequency
            hashing = HashingTF(inputCol="filtered", outputCol="hashedValues")
            # Transform into a DF
            hashed_df = hashing.transform(newFrame)

            # Fit the IDF on the data set
            idf = IDF(inputCol="hashedValues", outputCol="idf_token")
            idfModel = idf.fit(hashed_df)
            rescaledData = idfModel.transform(hashed_df)

            # Create feature vectors
            #idf = IDF(inputCol='hash_token', outputCol='idf_token')
            clean_up = VectorAssembler(inputCols=['idf_token', 'length'],
                                       outputCol='features')
            output = clean_up.transform(rescaledData)

            ei_model = NaiveBayesModel.load("static/models/EI_Predictor.h5")
            sn_model = NaiveBayesModel.load("static/models/SN_Predictor.h5")
            tf_model = NaiveBayesModel.load("static/models/TF_Predictor.h5")
            jp_model = NaiveBayesModel.load("static/models/JP_Predictor.h5")

            test_e = ei_model.transform(output)
            e = test_e.toPandas()["prediction"].values[0]
            if e == 0:
                e_result = "I"
            else:
                e_result = "E"
            test_s = sn_model.transform(output)
            s = test_s.toPandas()["prediction"].values[0]
            if s == 0:
                s_result = "N"
            else:
                s_result = "S"
            test_t = tf_model.transform(output)
            t = test_t.toPandas()["prediction"].values[0]
            if t == 0:
                t_result = "F"
            else:
                t_result = "T"
            test_j = jp_model.transform(output)
            j = test_j.toPandas()["prediction"].values[0]
            if j == 0:
                j_result = "P"
            else:
                j_result = "J"

        else:
            message = "Please tell us more about yourself!"

    return render_template('index.html',
                           message=message,
                           test_e=e_result,
                           test_s=s_result,
                           test_t=t_result,
                           test_j=j_result)
# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setGaps(False)\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
  .setStopWords(englishStopWords)\
  .setInputCol("DescOut")
stops.transform(tokenized).show()


# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)

Example #29
def main():

    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description',
                              outputCol='words_all',
                              pattern='\\W')

    # remove stop words
    stopwords = '\n'.join((DATADIR / 'stopwords' / f).read_text().strip()
                          for f in ('mysql.txt', 'nltk.txt')).splitlines()
    remove_stopwords = StopWordsRemover(
        inputCol='words_all', outputCol='words_clean').setStopWords(stopwords)

    # get words frequency using simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean',
                                    outputCol='words_count',
                                    vocabSize=10000,
                                    minDF=5)

    # get tf-idf words frequencies

    add_wordtf = HashingTF(inputCol='words_clean',
                           outputCol='words_tf',
                           numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf',
                      outputCol='words_tfidf',
                      minDocFreq=5)

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
    ])
    # apply data preparation pipeline
    prepared = pipeline_wordcount.fit(data).transform(data)

    # split to training and testing
    training, testing = prepared.randomSplit([0.7, 0.3], seed=100500)

    # fit logistic regression models

    logistic_wordcount = LogisticRegression(regParam=0.3,
                                            elasticNetParam=0,
                                            featuresCol='words_count',
                                            labelCol='label',
                                            predictionCol='prediction',
                                            probabilityCol='probability')

    logistic_tfidf = LogisticRegression(regParam=0.3,
                                        elasticNetParam=0,
                                        featuresCol='words_tfidf',
                                        labelCol='label',
                                        predictionCol='prediction',
                                        probabilityCol='probability')

    for model, name in ((logistic_wordcount,
                         'Word count + Logistic regression'),
                        (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        evaluator = MulticlassClassificationEvaluator(
            predictionCol='prediction', metricName='accuracy')
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')
#### DATA DRIVEN APPROACH


# In[21]:

# we obtain the stop words from a website
import requests
stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()
len(stop_words)


# In[22]:

from pyspark.ml.feature import StopWordsRemover
sw_filter = StopWordsRemover()  .setStopWords(stop_words)  .setCaseSensitive(False)  .setInputCol("words")  .setOutputCol("filtered")


# In[23]:

from pyspark.ml.feature import CountVectorizer

# we will remove words that appear in 5 docs or less
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)  .setInputCol("filtered")  .setOutputCol("tf")


# In[24]:

# we now create a pipelined transformer
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(airportCleanDF)
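
# A short illustrative follow-up (assumed, not from the notebook): the fitted pipeline can
# now transform the same DataFrame to add the filtered tokens and term-frequency vectors.
cv_pipeline.transform(airportCleanDF).select("filtered", "tf").show(5, truncate=False)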
    return spark.createDataFrame(row_rdd, ["label", "text"])

##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)


if algo == "gbm":
    ## Create GBM model
Example #32
joinDF = clicksDF.join(jobsDF, clicksDF._3 == jobsDF._1, "inner")

jobsfeatures = joinDF.map(lambda x: (cleanhtml(x[5] + ' ' + x[6]), x[
    0]))  # concatenate job title and description, the 2nd is click ID
jobsfeaturesDF = sqlContext.createDataFrame(jobsfeatures)
# or jobsfeaturesDF =jobsfeatures.toDF()
# Here if everything went well you see the no more HTML if running the command jobsfeaturesDF.take(1) to get 1 record

# tokenizer to create a "terms" column so for example:
# from _1 we have terms
tokenizer = Tokenizer(inputCol="_1", outputCol="terms")
termsData = tokenizer.transform(jobsfeaturesDF)

# remover to remove stop words that don't contribute so for example
# from terms we have filtered
remover = StopWordsRemover(inputCol="terms", outputCol="filtered")
filteredTermsData = remover.transform(termsData)

# http://spark.apache.org/docs/latest/ml-features.html
# Both HashingTF and CountVectorizer can be used to generate the term frequency vectors.
# HashingTF is a Transformer which takes sets of terms and converts those sets into
# fixed-length feature vectors. In text processing, a “set of terms” might be a bag of
# words. HashingTF utilizes the hashing trick.
# so from filtered we have rawFeatures
tf = HashingTF(inputCol="filtered",
               outputCol="rawFeatures").transform(filteredTermsData)

# IDF: IDF is an Estimator which is fit on a dataset and produces an IDFModel. The IDFModel takes feature vectors (generally created from HashingTF or
# CountVectorizer) and scales each column. Intuitively, it down-weights columns which appear frequently in a corpus.
idf = IDF(inputCol="rawFeatures", outputCol="features").fit(tf)

# TF-IDF. Use tfidf.take(1) to see the 1st record
tfidf = idf.transform(tf)
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

# StopWordsRemover

from pyspark.ml.feature import StopWordsRemover

sentenceData = spark.createDataFrame([
    (0, ["I", "saw", "the", "red", "balloon"]),
    (1, ["Mary", "had", "a", "little", "lamb"])
], ["id", "raw"])

remover = StopWordsRemover(inputCol="raw", outputCol="removeded")
remover.transform(sentenceData).show(truncate=False)

# NGram
from pyspark.ml import Pipeline
from pyspark.ml.feature import IDF, Tokenizer
from pyspark.ml.feature import NGram

sentenceData = spark.createDataFrame([
    (0.0, "I love Spark"),
    (0.0, "I love python"),
    (1.0, "I think ML is awesome")],
    ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
idf = IDF(inputCol="rawFeatures", outputCol="features")
Example #34
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
import pandas as pd
import pickle

ICD9CODES = pickle.load(open("./data/ICD9CODES.p", "r"))
ICD9CODES_TOP10 = pickle.load(open("./data/ICD9CODES_TOP10.p", "r"))
ICD9CODES_TOP50 = pickle.load(open("./data/ICD9CODES_TOP50.p", "r"))
ICD9CAT_TOP10 = pickle.load(open("./data/ICD9CAT_TOP10.p", "r"))
ICD9CAT_TOP50 = pickle.load(open("./data/ICD9CAT_TOP50.p", "r"))

from pyspark.ml.feature import StopWordsRemover
STOPWORDS_v0 = StopWordsRemover.loadDefaultStopWords("english") + ICD9CODES
STOPWORDS_v0 = [str(i) for i in STOPWORDS_v0]

# print "TFIDF v0 stop words"
# print STOPWORDS_v0

from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StopWordsRemover


def create_TFIDF_v0(trainData,
                    applyData,
                    inputCol="text",
                    outputCol="features",
                    minDocFreq=3,
                    numFeatures=20):
    tokenizer = RegexTokenizer(pattern="[.:\s]+",
                               inputCol=inputCol,
                               outputCol="z_words")
    wordsData1 = tokenizer.transform(trainData)
Example #35
# Create contexts
sc = SparkContext(appName="SparkWorkshop")
sqlContext = SQLContext(sc)

# Set up user defined functions and object for transformations
expression = re.compile(r'<.*?>')
parser = HTMLParser.HTMLParser()

def strip_tags(html):
    return parser.unescape(
        expression.sub('', html)
        )

strip_tags_udf = udf(strip_tags)
tokenizer = Tokenizer(inputCol="comment_clean", outputCol="words")
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="tokens")

# Load data
comments = sqlContext.read.json(fn)

# Calculate tokens dataframe as one pipeline
tokens = stopWordsRemover.transform(
             tokenizer.transform(comments\
                 .withColumn("comment_clean", strip_tags_udf(comments["comment_text"]))\
             )\
         )\
         .select(explode("tokens").alias("token"))\
         .groupBy("token")\
         .count()\
         .orderBy("count", ascending=False)\
         .select("count")\
Example #36
# %%
# Computing sentiment column based on rating
sentiment = when(col("rating") <= 5, 0).otherwise(1)

df = df.withColumn("sentiment", sentiment)
df = df.withColumn('length', length(df['review']))

# %% [markdown]
# ## Feature Transformation

# %%
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

tokenizer = Tokenizer(inputCol="review", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
pos_neg = StringIndexer(inputCol='sentiment', outputCol='label')

# %%
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# %%
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                           outputCol='features')

# %%
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LinearSVC
Example #37
    pos_rdd = pos.map(
        lambda p: Row(text=p.encode('utf-8').strip(), label=float(1.0)))
    neg_rdd = neg.map(
        lambda n: Row(text=n.encode('utf-8').strip(), label=float(0.0)))

    pos_all = spark.createDataFrame(pos_rdd).withColumn(
        "label", lit(1.0)).withColumn("id", monotonically_increasing_id())
    neg_all = spark.createDataFrame(neg_rdd).withColumn(
        "label", lit(0.0)).withColumn("id", monotonically_increasing_id())

    training = pos_all.unionAll(neg_all)

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                               outputCol="relevant_words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                          outputCol="rawFeatures")
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    #countVector = CountVectorizer(inputCol="relevant_words", outputCol="features", vocabSize=10000, minDF=2.0)
    #    lr = LogisticRegression(maxIter=10, regParam=0.001)
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    #    pipeline_lr = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])
    pipeline_nb = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])

    # Fit the pipeline to training documents.
    #    model_lr = pipeline_lr.fit(training)
    model_nb = pipeline_nb.fit(training)

    # Reading data from kafka-topic1

def preprocess_tweets(tweets):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tweets = tokenizer.transform(tweets)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    tweets = remover.transform(tweets)
    return tweets
Exemple #39
0
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets the Latent Dirichlet Allocation (LDA) topics for words
    within articles.

    config_file must be the path to a LDA configuration file in YAML
    format. For example:

        keyword: <KEYWORD>
        optimizer: online|em
        max_iterations: <N>
        ntopics: <N>
        topic_words: <N>

    <N> must be >= 1 for each parameter.

    The keyword and the words in documents are normalized by removing all
    non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <0>: [<WORD_0>, ..., <WORD_topicwords>],
          <1>: [<WORD_0>, ..., <WORD_topicwords>],
          <2>: [<WORD_0>, ..., <WORD_topicwords>],
          ...
          <ntopics>: [<WORD_0>, ..., <WORD_topicwords>],
          years:[<MIN_YEAR>, <MAX_YEAR>]
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: LDA topics
    :rtype: dict
    """
    with open(config_file, 'r') as f:
        config = load(f)
        keyword = config['keyword']
        optimizer = config['optimizer']
        if optimizer != 'online' and optimizer != 'em':
            raise ValueError("optimizer must be 'online' or 'em' but is '{}'"
                             .format(optimizer))
        max_iterations = config['max_iterations']
        if max_iterations < 1:
            raise ValueError('max_iterations must be at least 1')
        ntopics = config['ntopics']
        if ntopics < 1:
            raise ValueError('ntopics must be at least 1')
        topic_words = config['topic_words']
        if topic_words < 1:
            raise ValueError('topic_words must be at least 1')

    keyword = query_utils.normalize(keyword)

    # [date, ...]
    # =>
    # [(year, year), ...]
    # =>
    # (year, year)
    min_year, max_year = issues \
        .filter(lambda issue: issue.date) \
        .map(lambda issue: (issue.date.year, issue.date.year)) \
        .reduce(min_max_tuples)

    # [issue, issue, ...]
    # =>
    # [article, article, ...]
    # =>
    # [(article, 0), (article, 1), ...]
    # =>
    # [Row, Row, ...]
    articles_rdd = issues.flatMap(lambda issue: issue.articles) \
        .filter(lambda article:
                article_contains_word(article,
                                      keyword,
                                      PreprocessWordType.NORMALIZE)) \
        .zipWithIndex() \
        .map(article_idx_to_words_row)

    spark = SparkSession \
        .builder \
        .appName('lda') \
        .getOrCreate()

    articles_df = spark.createDataFrame(articles_rdd)

    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    articles_df = remover.transform(articles_df)

    vectortoriser = CountVectorizer(inputCol='filtered', outputCol='vectors')
    model = vectortoriser.fit(articles_df)

    vocabulary = model.vocabulary
    articles_df = model.transform(articles_df)

    corpus = articles_df \
        .select('idx', 'vectors') \
        .rdd \
        .map(lambda a: [a[0], Vectors.fromML(a[1])]) \
        .cache()

    # Cluster the documents into N topics using LDA.
    lda_model = LDA.train(corpus,
                          k=ntopics,
                          maxIterations=max_iterations,
                          optimizer=optimizer)
    topics_final = [topic_render(topic, topic_words, vocabulary)
                    for topic in lda_model.describeTopics(maxTermsPerTopic=topic_words)]

    topics = [('years', [min_year, max_year])]
    for i, topic in enumerate(topics_final):
        term_words = []
        for term in topic:
            term_words.append(term)
        topics.append((str(i), term_words))
    return topics
Exemple #40
0
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *
from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    fields = [StructField("subreddit", StringType(), True),
              StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))
    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)
    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(['subreddit', explode(filteredDataFrame.filtered).alias("term")])
    # group by subreddit and term, then count occurrences of each term per subreddit
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db = Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']).filter('count > {}'.format(THRESHOLD)).foreachPartition(db.saveSubredditWords)
Exemple #41
0
    sys.exit(1)

# Transform data into ready-format
df_train = (text_data.fillna("").select(
    concat(col("title"), lit(" "), col("abstract"), lit(" "),
           col("full_text")).alias("text")))

# Create pipeline objects
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol(
    "document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalizer")
stemmer = Stemmer().setInputCols(["normalizer"]).setOutputCol("stem")
finisher = Finisher().setInputCols(["stem"]).setOutputCols(
    ["to_spark"]).setValueSplitSymbol(" ")
stopword_remover = StopWordsRemover(inputCol="to_spark", outputCol="filtered")
tf = CountVectorizer(inputCol="filtered", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
lda = LDA(k=10, maxIter=10)

# Create pipeline
pipeline = Pipeline(stages=[
    document_assembler, tokenizer, normalizer, stemmer, finisher,
    stopword_remover, tf, idf, lda
])

model = pipeline.fit(df_train)
vocab = model.stages[-3].vocabulary
raw_topics = model.stages[-1].describeTopics().collect()
topic_inds = [ind.termIndices for ind in raw_topics]
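
# The example ends here; as a follow-on, a minimal hedged sketch (an assumption,
# not from the source) that maps each topic's term indices back to words:
topics_words = [[vocab[idx] for idx in inds] for inds in topic_inds]
for topic_id, words in enumerate(topics_words):
    print("topic {}: {}".format(topic_id, ", ".join(words)))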
Exemple #42
0
    "spark.sql.warehouse.dir",
    '/user/hive/warehouse').enableHiveSupport().getOrCreate()

data = sc.textFile(trainingData)

header = data.first()
rdd = data.filter(lambda row: row != header)

r = rdd.mapPartitions(lambda x: csv.reader(x))
r = r.map(lambda x: (processTweet(x[3]), int(x[1])))

r = r.map(lambda x: Row(sentence=x[0], label=int(x[1])))
df = spark.createDataFrame(r).orderBy(rand()).limit(500000)

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="base_words")
hashingTF = HashingTF(numFeatures=10000,
                      inputCol="base_words",
                      outputCol="features")

lr = LogisticRegression(maxIter=10000, regParam=0.001, elasticNetParam=0.0001)

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])

splits = df.randomSplit([0.6, 0.4], 223)
trainSet = splits[0]
testSet = splits[1]

lrModel = pipeline.fit(trainSet)
lrResult = lrModel.transform(testSet)
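
# Hedged evaluation sketch (an assumption, not part of the source snippet):
# measure accuracy of the fitted pipeline on the held-out test set.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy: {:.3f}".format(evaluator.evaluate(lrResult)))
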
df = spark.read.option('delimiter', '\t').csv('items_data.tsv', header=True)

# Insert items id
df = df.withColumn("id", monotonically_increasing_id())

# Init the tokenizer
tokenizerR = RegexTokenizer(inputCol='description', outputCol='tokenized', pattern='\\W')

# Tokenize the item description text
df = tokenizerR.transform(df)

# Create the italian stopwords collection
stop_words = list(stopwords.words('italian'))

# Remove the italian stopwords from the item description
remover = StopWordsRemover(inputCol="tokenized", outputCol="stopwords_removed", stopWords=stop_words)
df = remover.transform(df)

# Last preprocess operations
ita_stemmer = ItalianStemmer()
eng_stemmer = PorterStemmer()

# Make the last preprocessing operations
def text_preprocessing(tokens):
    # Remove tokens composed by only a number
    filtered_token = [token for token in tokens if not re.search(r'\b[0-9]+\b\s*', token)]

    # Stem the tokens for both italian and english
    filtered_token = [ita_stemmer.stem(token) for token in filtered_token]
    filtered_token = [eng_stemmer.stem(token) for token in filtered_token]
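    # assumed completion: return the processed tokens (the source snippet
    # breaks off at this point)
    return filtered_token

# A hedged sketch (an assumption, not from the source) of applying the helper
# above to the StopWordsRemover output column through a Spark UDF:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

preprocess_udf = udf(text_preprocessing, ArrayType(StringType()))
df = df.withColumn("stemmed_tokens", preprocess_udf(df["stopwords_removed"]))
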
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import StopWordsRemover
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("StopWordsRemoverExample")\
        .getOrCreate()

    # $example on$
    sentenceData = spark.createDataFrame([
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
    ], ["id", "raw"])

    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
    remover.transform(sentenceData).show(truncate=False)
    # $example off$

    spark.stop()
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

--------------------------------------------------
# Exercise_11 
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
      .fit(wrangled).transform(wrangled)
      
tf_idf.select('terms', 'features').show(4, truncate=False)

--------------------------------------------------
# Exercise_12 
# Split the data into training and testing sets
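# Hedged sketch (an assumption; the exercise body is not shown): a typical
# 80/20 split with a fixed seed for reproducibility.
sms_train, sms_test = tf_idf.randomSplit([0.8, 0.2], seed=13)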
Exemple #46
0
# using 1000 records as a small debugging data set
train_sents1 = train_df.select('genre', 'sentence1')
train_sents2 = train_df.select('genre', 'sentence2')
# train_sents1.show(5)

udf_lower = F.udf(lower_folding, StringType())
train_sents1_lower = train_sents1.withColumn('lower_sents',
                                             udf_lower('sentence1'))
# train_sents1_lower.show(5)

udf_rv_punc = F.udf(remove_punctuation_re, StringType())
train_sents1_rv_punc = train_sents1_lower.withColumn(
    'rv_punc_sents', udf_rv_punc('lower_sents'))

tokenizer = Tokenizer(inputCol="rv_punc_sents", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
w2v = Word2Vec(vectorSize=300,
               minCount=0,
               inputCol="filtered_tokens",
               outputCol="avg_word_embed")

doc2vec_pipeline = Pipeline(stages=[tokenizer, remover, w2v])
doc2vec_model = doc2vec_pipeline.fit(train_sents1_rv_punc)
doc2vecs_df = doc2vec_model.transform(train_sents1_rv_punc)
w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

genre2label = StringIndexer(inputCol="genre", outputCol="label")
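
# Hedged sketch (an assumption, not from the source) of finishing the example:
# index the genre labels, fit a random forest on the averaged word embeddings,
# and report accuracy on the held-out split.
from pyspark.ml import Pipeline

rf = RandomForestClassifier(featuresCol="avg_word_embed", labelCol="label",
                            numTrees=100, seed=42)
genre_pipeline = Pipeline(stages=[genre2label, rf])
genre_model = genre_pipeline.fit(w2v_train_df)
genre_predictions = genre_model.transform(w2v_test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Genre accuracy: {:.3f}".format(evaluator.evaluate(genre_predictions)))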
Exemple #47
0
"""
from pyspark.ml.feature import StopWordsRemover
sentenceDataFrame= spark.createDataFrame([
    (0,['I','saw','the','green','horse']),
    (1,['Mary','had','a','little','lamb'])
],['id','tokens'])
sentenceDataFrame.show()
"""
+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+
"""
remover= StopWordsRemover(inputCol='tokens',outputCol='filtered')
remover.transform(sentenceDataFrame).show()
"""
+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+
"""
# n-gram
from pyspark.ml.feature import NGram
wordDataFrame= spark.createDataFrame([
    (0,["Hi", "I", "heard", "about", "Spark"]),
    (1,["I", "wish", "java", "could", "use", "case", "classes"]),
    (2,["Logistic", "regression", "models", "are", "neat"]),