from pyspark import SparkContext
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

sc = SparkContext(appName="lda")


def LDA_spark():
    # Load and parse the data: one document per line of space-separated term counts.
    data = sc.textFile("data/mllib/sample_lda_data.txt")
    parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
    # Index documents with unique IDs.
    corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    # Cluster the documents into three topics using LDA.
    model = LDA.train(corpus, k=3)
    # Save and load the model.
    model.save(sc, "myModelPath")
    sameModel = LDAModel.load(sc, "myModelPath")
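# The file read above holds bag-of-words documents: each line is a space-separated list of
# term counts over a shared vocabulary, not raw text. A minimal sketch with made-up counts,
# assuming the SparkContext `sc` created above:
docs = [
    [1.0, 2.0, 6.0, 0.0, 2.0, 3.0],
    [0.0, 1.0, 0.0, 3.0, 0.0, 5.0],
    [4.0, 0.0, 1.0, 3.0, 0.0, 0.0],
]
toy_corpus = sc.parallelize(docs) \
    .map(Vectors.dense) \
    .zipWithIndex() \
    .map(lambda x: [x[1], x[0]])  # LDA.train expects an RDD of [doc_id, term-count vector]
toy_model = LDA.train(toy_corpus, k=2)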
import re

import nltk
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.mllib.clustering import LDA as MLlibLDA
from pyspark.mllib.linalg import Vectors as MLlibVectors


def main():
    # `spark`, `sc`, `tablenames` and `keyspace` are expected to be defined by the caller.
    for tn in tablenames:
        data = spark.read.format("org.apache.spark.sql.cassandra")\
            .options(table=tn, keyspace=keyspace).load().limit(1000)
        data = data.sort('imdb_score', ascending=False)
        desc = data.rdd.map(lambda x: x['description']).filter(lambda x: x is not None)

        # Tokenize the descriptions and drop non-alphabetic, short, and stop-word tokens.
        StopWords = nltk.corpus.stopwords.words('english')
        StopWords.extend([" ... See full summary"])
        tokenized = desc.map(lambda y: y.strip().lower())\
            .map(lambda x: re.split(" ", x))\
            .map(lambda word: [x for x in word if x.isalpha()])\
            .map(lambda word: [x for x in word if len(x) > 3])\
            .map(lambda word: [x for x in word if x not in StopWords])\
            .zipWithIndex()
        df_txts = spark.createDataFrame(tokenized, ["words", "index"])

        # Build TF-IDF features.
        countVec = CountVectorizer(inputCol="words", outputCol="raw_features", vocabSize=5000, minDF=10.0)
        CountVectMod = countVec.fit(df_txts)
        result = CountVectMod.transform(df_txts)
        idf = IDF(inputCol="raw_features", outputCol="features")
        idfModel = idf.fit(result)
        resultTFIdf = idfModel.transform(result)

        # Train an MLlib LDA model on (index, features) pairs, converting ml vectors to mllib vectors.
        totalTopics = 10
        totalItr = 100
        LDAModel = MLlibLDA.train(
            resultTFIdf.select('index', 'features').rdd.mapValues(MLlibVectors.fromML).map(list),
            k=totalTopics, maxIterations=totalItr)

        # Look up the top words of each topic in the CountVectorizer vocabulary.
        maxwordsTopic = 5
        topicIndices = sc.parallelize(LDAModel.describeTopics(maxTermsPerTopic=5))
        VCarr = CountVectMod.vocabulary

        def finalTopic(topic):
            terms = topic[0]
            result = []
            for i in range(maxwordsTopic):
                result.append(VCarr[terms[i]])
            return result

        topics_final = topicIndices.map(lambda topic: finalTopic(topic)).collect()
        print(topics_final)
        for topic in range(len(topics_final)):
            print("Topic" + str(topic) + ":")
            for term in topics_final[topic]:
                print(term)
            print('\n')
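# For comparison, a sketch of the same training step with the DataFrame-based
# pyspark.ml.clustering.LDA, which consumes the CountVectorizer output directly and avoids
# the MLlibVectors.fromML round-trip. Assumes the `result` DataFrame built above; the
# parameter values mirror totalTopics/totalItr and are illustrative:
from pyspark.ml.clustering import LDA as MLLDA

ml_lda = MLLDA(k=10, maxIter=100, featuresCol="raw_features")
ml_model = ml_lda.fit(result)
# describeTopics returns a DataFrame of (topic, termIndices, termWeights).
ml_model.describeTopics(5).show(truncate=False)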
import os
import shutil

from pyspark import SparkContext
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors


def test():
    sc = SparkContext(master='local[4]', appName='lda')
    sc.setLogLevel('ERROR')

    # Training settings (corpus_filename, max_iter, seed, checkin_point_interval, K,
    # optimizer, alpha, beta) are expected to be defined at module level.
    def train():
        data = sc.textFile(corpus_filename).map(
            lambda line: Vectors.dense([float(i) for i in line.strip().split()]))
        corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
        # print(corpus.take(5))
        lda_model = LDA.train(rdd=corpus, maxIterations=max_iter, seed=seed,
                              checkpointInterval=checkin_point_interval, k=K,
                              optimizer=optimizer, docConcentration=alpha,
                              topicConcentration=beta)
        if os.path.exists('./ldamodel'):
            shutil.rmtree('./ldamodel')
        lda_model.save(sc, "./ldamodel")

    # train()
    lda_model = LDAModel.load(sc, "./ldamodel")

    # Topic-word matrix (unnormalized distribution; each column represents one topic).
    topics = lda_model.topicsMatrix()
    # for tid in range(3):
    #     print('Topic' + str(tid) + ':')
    #     for wid in range(0, lda_model.vocabSize()):
    #         print(' ' + str(topics[wid, tid] / sum(topics[:, tid])))  # add normalization
    #         # print(' ' + str(topics[wid, tid]))

    # Topic-word distribution per topic: ([term ids, sorted by weight in descending order],
    # [term weights within the topic]).
    topics_dist = lda_model.describeTopics()
    for tid, topic in enumerate(topics_dist):
        print('Topic' + str(tid) + ':' + '\n', topic)

    # Per-document topic distributions are not exposed by pyspark.mllib; only pyspark.ml provides them.
    # doc_topic = lda_model
    sc.stop()
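# The final comment above notes that per-document topic distributions are only available
# through the DataFrame-based API. A minimal sketch of that route, assuming a DataFrame `df`
# with a "features" column of term-count vectors (names and parameters are illustrative):
from pyspark.ml.clustering import LDA as MLLDA

ml_model = MLLDA(k=3, maxIter=50, featuresCol="features").fit(df)
# transform() appends a "topicDistribution" vector column: each document's inferred topic mixture.
ml_model.transform(df).select("topicDistribution").show(truncate=False)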
from pyspark import SparkContext
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data
sc = SparkContext(appName="lda")
# data = sc.textFile("data/sample_lda_data.txt")
# parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# # Index documents with unique IDs
# corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
#
# # Cluster the documents into three topics using LDA
# ldaModel = LDA.train(corpus, k=3)

ldaModel = LDAModel.load(sc, "data/model/mymodel")

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model
# ldaModel.save(sc, "data/model/mymodel")
# sameModel = LDAModel.load(sc, "data/model/mymodel")
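# As an earlier snippet's comment notes, the columns of topicsMatrix() are not necessarily
# normalized. A small sketch that column-normalizes the `topics` array obtained above so each
# topic becomes a probability distribution over the vocabulary:
import numpy as np

topics_arr = np.asarray(topics)                    # shape: vocabSize x k
topic_dists = topics_arr / topics_arr.sum(axis=0)  # each column now sums to 1
print(topic_dists[:, 0].sum())                     # ~1.0 for topic 0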
from pyspark.mllib.clustering import LDA, LDAModel

# `sc` and `data_features` are assumed to be defined; LDA.train expects each element of the
# parallelized collection to be a [doc_id, term-count vector] pair.
corpus = sc.parallelize(data_features)
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model
ldaModel.save(sc, "myModelPath")
sameModel = LDAModel.load(sc, "myModelPath")
import os
import sys

from pyspark import SparkContext
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

sc = SparkContext()

for route, directories, files in os.walk('/media/deepak/data_words.csv'):
    for file in files:
        f_path = os.path.join(route, file)
        f_name = os.path.join(route, file).split('/')

        # Load and parse the data
        data = sc.textFile(f_path)
        dataParsed = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(',')]))
        # Index documents with unique IDs
        corpus = dataParsed.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

        # Cluster the documents into three topics using LDA
        ldaModel = LDA.train(corpus, k=3)

        # Output topics. Each is a distribution over words (matching word count vectors)
        print(f_name[-1])
        print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
        topics = ldaModel.topicsMatrix()
        for topic in range(3):
            print("Topic " + str(topic) + ":")
            for word in range(0, ldaModel.vocabSize()):
                print(" " + str(topics[word][topic]))

        # Save and load model
        ldaModel.save(sc, "/media/deepak/lda_output/" + f_name[-1])
        sameModel = LDAModel.load(sc, "/media/deepak/lda_output/" + f_name[-1])
def load_model():
    # Assumes a SparkContext `sc` already exists in the calling scope.
    from pyspark.mllib.clustering import LDA, LDAModel
    # sc = SparkContext(appName='lda_load', conf=conf)
    path = "/user/rmusters/ldaModel2"
    ldaModel = LDAModel.load(sc, path)
    return ldaModel
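# A sketch of using the loaded model (assumes `sc` exists so load_model() succeeds and the
# saved model path above is valid):
model = load_model()
print("Vocabulary size:", model.vocabSize())
# describeTopics() returns, for each topic, a pair of (term indices, term weights).
for tid, (term_indices, term_weights) in enumerate(model.describeTopics(maxTermsPerTopic=5)):
    print("Topic", tid, term_indices, term_weights)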