Code Example #1
File: LDA_pyspark.py  Project: bz10bis/FinalProject
# Assumes the module-level imports and globals defined elsewhere in the file:
# sc, spark, Row, CountVectorizer, mllib Vectors/LDA, unidecode, nbTopics,
# wordNumbers, wordTokenize and topic_render.
def LDA_Treatment(text):
    finalTopics = []
    txt = wordTokenize(text)  # assumed to return a space-separated token string
    # One-document RDD of Rows: an index plus the token list
    data = sc.parallelize([txt]).zipWithIndex().map(lambda val: Row(idd=val[1], _words=val[0].split(" ")))
    docDF = spark.createDataFrame(data)
    # Count-vectorize the tokens
    Vector = CountVectorizer(inputCol="_words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)
    # mllib's LDA expects an RDD of [doc_id, mllib Vector] pairs
    corpus = result.select("idd", "vectors").rdd.map(lambda val: [val[0], Vectors.fromML(val[1])]).cache()
    ldaModel = LDA.train(corpus, k=nbTopics, maxIterations=1000, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.vocabulary
    topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
    topics_final = topicIndices.map(lambda topic: topic_render(topic, vocabArray)).collect()

    # Flatten the per-topic term lists into one ASCII-transliterated list
    for topic in range(len(topics_final)):
        for term in topics_final[topic]:
            term = unidecode.unidecode(term)
            finalTopics.append(term)

    return finalTopics
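
The helper topic_render is called above but not included in the snippet. A minimal sketch of what it might look like, assuming each element returned by mllib's describeTopics() is a (termIndices, termWeights) pair and only the term strings are needed:

# Hypothetical helper, not part of the original project: maps a topic's term
# indices back to vocabulary words.
def topic_render(topic, vocabArray):
    term_indices = topic[0]
    return [vocabArray[int(i)] for i in term_indices]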
Code Example #2
    #     new_tweet= ''
    #     for tok in tokens:
    #         if tok not in stop_words and tok[0] != '@' and tok[0] != '#' and tok[:4] != 'http':
    #             new_tweet += ' '+tok
    #     new_tweet = new_tweet.strip()
    #     tweets.append(new_tweet)
    # f.close()
    # fd = codecs.open('cleaned_example.txt', 'w', encoding = 'utf-8')
    # for tweet in tweets:
    #     fd.write(tweet+'\n')

    rdd = sc.textFile('opinion.txt').zipWithIndex().map(
        lambda pair: Row(idd=pair[1], words=pair[0].split(" ")))
    docDF = spark.createDataFrame(rdd)
    Vector = CountVectorizer(inputCol="words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)

    corpus = result.select(
        "idd",
        "vectors").rdd.map(lambda row: [row[0], Vectors.fromML(row[1])]).cache()

    # Cluster the documents into five topics using LDA
    ldaModel = LDA.train(corpus, k=5, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.vocabulary

    wordNumbers = 5  # number of words per topic
    topicIndices = sc.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
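
The snippet stops after parallelizing the topic descriptions. A minimal sketch of the usual next step, assuming the goal is simply to print the top words of each topic using the CountVectorizer vocabulary:

    # Sketch, not in the original snippet: map each topic's term indices back to words.
    topics_final = topicIndices.map(
        lambda topic: [vocabArray[int(i)] for i in topic[0]]).collect()
    for i, terms in enumerate(topics_final):
        print("Topic %d: %s" % (i, ", ".join(terms)))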
Code Example #3
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer

#http://spark.apache.org/docs/latest/mllib-clustering.html#latent-dirichlet-allocation-lda
# Load and parse the data
#data = to_list[0:1000]
from pyspark.sql.types import StringType
from pyspark.sql.functions import monotonically_increasing_id, array
from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors

corpus_df = spark.createDataFrame(data, StringType())
corpus_df = corpus_df.withColumn("index",monotonically_increasing_id())
corpus_df = corpus_df.withColumn("arrayColumn", array("value"))
#data = sc.textFile(path).zipWithIndex().map(lambda (words,idd): Row(idd= idd, words = words.split(" ")))
#docDF = spark.createDataFrame(data)
Vector = CountVectorizer(inputCol="arrayColumn", outputCol="vectors")
model = Vector.fit(corpus_df)
result = model.transform(corpus_df)

# Cluster the documents into topics using LDA
#ldaModel = LDA.train(data, k=10)  # broken as written: LDA.train needs an RDD of [id, vector] pairs, not the raw text list (see below)

#ldaModel.save(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")
#sameModel = LDAModel\
#    .load(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")

# COMMAND ----------

num_topics = 10
max_iterations = 100

# mllib's LDA.train expects an RDD of [doc_id, mllib Vector] pairs, so convert first
corpus = result.select("index", "vectors").rdd.map(
    lambda row: [row[0], Vectors.fromML(row[1])]).cache()
lda_model = LDA.train(corpus, k=num_topics, maxIterations=max_iterations)
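
The snippet ends right after training. A short sketch of how the learned topics could be inspected, assuming the CountVectorizer vocabulary (model.vocabulary) is used to map term indices back to words:

# Sketch, not in the original snippet: print the top words of each topic.
vocab = model.vocabulary
for t, (indices, weights) in enumerate(lda_model.describeTopics(maxTermsPerTopic=5)):
    print("Topic %d: %s" % (t, ", ".join(vocab[int(i)] for i in indices)))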
Code Example #4
        lambda pair: Row(idd=pair[1], words=pair[0].split(' ')))
    docDF = sqlContext.createDataFrame(parsedData)
    # docDF.show()

    # stopword remover
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    remover.transform(docDF).show(truncate=False)

    Vector = CountVectorizer(inputCol="filtered", outputCol="vectors")

    pipeline = Pipeline(stages=[remover])

    model_prep = pipeline.fit(docDF)
    result = model_prep.transform(docDF)

    model = Vector.fit(result)
    result = model.transform(result)
    result.show()

    corpus = result.select(
        "idd",
        "vectors").rdd.map(lambda row: [row[0], Vectors.fromML(row[1])]).cache()

    # Cluster the documents into six topics using LDA
    ldaModel = LDA.train(corpus, k=6, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()

    vocabArray = model.vocabulary

    topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=20))
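
As in the earlier examples, this snippet stops at the parallelized topic descriptions. A minimal sketch of pairing each topic's terms with their weights, assuming the (termIndices, termWeights) output format of mllib's describeTopics():

    # Sketch, not in the original snippet: pair terms with their weights per topic.
    topics_final = topicIndices.map(
        lambda topic: list(zip([vocabArray[int(i)] for i in topic[0]], topic[1]))).collect()
    for i, topic_terms in enumerate(topics_final):
        print("Topic %d: %s" % (i, topic_terms[:5]))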