from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors


def LDA_spark():
    # Relies on a module-level SparkContext bound to `sc` (created by the caller)
    # Load and parse the data: one space-separated term-count vector per line
    data = sc.textFile("data/mllib/sample_lda_data.txt")
    parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
    # Index documents with unique IDs
    corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

    # Cluster the documents into three topics using LDA
    ldaModel = LDA.train(corpus, k=3)

    # Save and load model
    ldaModel.save(sc, "myModelPath")
    sameModel = LDAModel.load(sc, "myModelPath")
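A minimal driver sketch for the function above, assuming a local Spark installation and that data/mllib/sample_lda_data.txt is present relative to the working directory; LDA_spark() reads the SparkContext from the module-level name sc, so it has to be created before the call.

from pyspark import SparkContext

sc = SparkContext(master="local[2]", appName="lda_example")
LDA_spark()
sc.stop()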
Example 2
import os
import shutil

from pyspark import SparkContext
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors


def test():
    sc = SparkContext(master='local[4]', appName='lda')
    sc.setLogLevel('ERROR')

    def train():
        # corpus_filename, max_iter, seed, checkin_point_interval, K, optimizer,
        # alpha and beta are assumed to be defined at module level
        data = sc.textFile(corpus_filename).map(lambda line: Vectors.dense(
            [float(i) for i in line.strip().split()]))
        # Index documents with unique IDs
        corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
        # print(corpus.take(5))

        lda_model = LDA.train(rdd=corpus,
                              maxIterations=max_iter,
                              seed=seed,
                              checkpointInterval=checkin_point_interval,
                              k=K,
                              optimizer=optimizer,
                              docConcentration=alpha,
                              topicConcentration=beta)
        # Remove any previously saved model directory before writing a new one
        if os.path.exists('./ldamodel'):
            shutil.rmtree('./ldamodel')
        lda_model.save(sc, "./ldamodel")

    # train()

    lda_model = LDAModel.load(sc, "./ldamodel")

    # topic-word distribution (unnormalized; each column corresponds to one topic)
    topics = lda_model.topicsMatrix()
    # for tid in range(3):
    #     print('Topic' + str(tid) + ':')
    #     for wid in range(0, lda_model.vocabSize()):
    #         print(' ' + str(topics[wid, tid] / sum(topics[:, tid])))  # normalized within each topic
    #         # print(' ' + str(topics[wid, tid]))

    # topic-word ranking per topic: ([term ids sorted by descending weight], [term weights within the topic])
    topics_dist = lda_model.describeTopics()
    for tid, topic in enumerate(topics_dist):
        print('Topic' + str(tid) + ':' + '\n', topic)

    # Per-document topic distributions are not exposed by mllib; only the DataFrame-based ml API provides them
    # doc_topic = lda_model

    sc.stop()
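The last comment in test() points out that per-document topic distributions are only exposed by the DataFrame-based pyspark.ml API. The following is a minimal sketch of that route, assuming an active SparkSession bound to spark and the same whitespace-separated corpus_filename used above.

from pyspark.ml.clustering import LDA as MLLDA
from pyspark.ml.linalg import Vectors as MLVectors

# Build a DataFrame with a 'features' column of term-count vectors
df = spark.read.text(corpus_filename).rdd \
    .map(lambda row: (MLVectors.dense([float(i) for i in row.value.strip().split()]),)) \
    .toDF(["features"])

ml_model = MLLDA(k=3, maxIter=10).fit(df)
# transform() appends a 'topicDistribution' column holding each document's topic mixture
ml_model.transform(df).select("topicDistribution").show(truncate=False)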
Example 4
from pyspark import SparkContext

from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data
sc = SparkContext(appName="lda")
# data = sc.textFile("data/sample_lda_data.txt")
# parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# # Index documents with unique IDs
# corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
#
# # Cluster the documents into three topics using LDA
# ldaModel = LDA.train(corpus, k=3)

ldaModel = LDAModel.load(sc, "data/model/mymodel")


# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model
# ldaModel.save(sc, "data/model/mymodel")

# sameModel = LDAModel.load(sc, "data/model/mymodel")
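As the earlier example notes, topicsMatrix() returns unnormalized topic-term weights (one column per topic), so the values printed above are not probabilities. A small sketch, assuming numpy is available, that normalizes each column into a distribution over the vocabulary:

import numpy as np

topics = np.asarray(ldaModel.topicsMatrix())
topic_word_dist = topics / topics.sum(axis=0)  # each column now sums to 1
for tid in range(topic_word_dist.shape[1]):
    print("Topic " + str(tid) + ": " + str(topic_word_dist[:, tid]))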
Example 5
# Assumes an active SparkContext `sc` and that data_features is a list of
# [doc_id, term-count vector] pairs (see the sketch after this example)
corpus = sc.parallelize(data_features)

ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model
ldaModel.save(sc, "myModelPath")
sameModel = LDAModel.load(sc, "myModelPath")
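data_features is not defined in this excerpt; the following is a hypothetical construction of the kind of input the snippet above expects (document IDs paired with dense term-count vectors), with values chosen here purely for illustration.

from pyspark.mllib.linalg import Vectors

# Hypothetical term-count matrix: one row per document over a 5-word vocabulary
raw_counts = [
    [1.0, 2.0, 6.0, 0.0, 2.0],
    [1.0, 3.0, 0.0, 1.0, 3.0],
    [1.0, 4.0, 1.0, 0.0, 0.0],
]
data_features = [[i, Vectors.dense(counts)] for i, counts in enumerate(raw_counts)]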

Example 6
import os
import sys
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext

sc = SparkContext()
for route, directories, files in os.walk('/media/deepak/data_words.csv'):
    for file in files:
        f_path = os.path.join(route, file)
        f_name = os.path.join(route, file).split('/')
        # Load and parse the data
        data = sc.textFile(f_path)
        dataParsed = data.map(lambda line: Vectors.dense(
            [float(x) for x in line.strip().split(',')]))
        # Index documents with unique IDs
        corpus = dataParsed.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
        # Cluster the documents into three topics using LDA
        ldaModel = LDA.train(corpus, k=3)
        # Output topics. Each is a distribution over words (matching word count vectors)
        print(f_name[-1])
        print("Learned topics (as distributions over vocab of " +
              str(ldaModel.vocabSize()) + " words):")
        topics = ldaModel.topicsMatrix()
        for topic in range(3):
            print("Topic " + str(topic) + ":")
            for word in range(0, ldaModel.vocabSize()):
                print(" " + str(topics[word][topic]))
        # Save and load model
        ldaModel.save(sc, "/media/deepak/lda_output/" + f_name[-1])
        sameModel = LDAModel.load(sc, "/media/deepak/lda_output/" + f_name[-1])

def load_model():
    from pyspark.mllib.clustering import LDA, LDAModel
    #sc = SparkContext(appName='lda_load', conf=conf)
    path = "/user/rmusters/ldaModel2"
    ldaModel = LDAModel.load(sc, path)
    return ldaModel
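A usage sketch for the loaded model, assuming the module-level SparkContext sc created above is still active and that the HDFS path exists; describeTopics() returns, for each topic, a pair of parallel arrays (term indices sorted by descending weight, term weights within the topic).

model = load_model()
for tid, (term_ids, weights) in enumerate(model.describeTopics(maxTermsPerTopic=10)):
    print('Topic ' + str(tid) + ':')
    for term_id, weight in zip(term_ids, weights):
        print('  term %d -> %.4f' % (term_id, weight))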