def latent_dirichlet_allocation(unclustered_data, number_of_clusters,
                                max_iterations=20, doc_concentration=-1.0,
                                topic_concentration=-1.0, seed=None,
                                checkpoint_interval=10, optimizer='em'):
    if number_of_clusters < 1:
        raise ValueError("While clustering with LDA, "
                         "the given number of clusters is not positive")
    parsedData = unclustered_data.map(lambda lst: Vectors.dense(lst))
    # Index documents with unique IDs: (id, vector)
    corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    ldaModel = LDA.train(rdd=corpus,
                         k=number_of_clusters,
                         maxIterations=max_iterations,
                         docConcentration=doc_concentration,
                         topicConcentration=topic_concentration,
                         seed=seed,
                         checkpointInterval=checkpoint_interval,
                         optimizer=optimizer)
    topics = ldaModel.topicsMatrix()
    return [ldaModel, topics]

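# A minimal usage sketch for the function above. The SparkContext name (`sc`)
# and the toy per-document term-count rows are illustrative assumptions only.
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA

sc = SparkContext('local[*]', appName='lda-usage')
rows = sc.parallelize([[1.0, 0.0, 2.0], [0.0, 3.0, 1.0]])  # term counts per document
model, topics_matrix = latent_dirichlet_allocation(rows, number_of_clusters=2)
print(topics_matrix)
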
def getKeywordsInDataRange(sDF, oldestTime, newestTime, topics=1, wordsPerTopic=20):
    # Dates are expected in 'yyyy-MM-dd' format.
    oldestTime = datetime.strptime(oldestTime, '%Y-%m-%d')
    newestTime = datetime.strptime(newestTime, '%Y-%m-%d')
    # Filter rows to the requested date range.
    filteredText = sDF\
        .select(
            "id",
            date_format('day', 'yyyy-MM-dd').alias('time'),
            col("title").alias("text"))\
        .where((col("time") >= oldestTime) & (col("time") <= newestTime))
    # Pipeline for preparing the data.
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  # strip punctuation and split on non-letters
    stopRemover = StopWordsRemover(
        inputCol="splitted", outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")
    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])
    # Build the corpus for LDA.
    try:
        model = pipeline.fit(filteredText)
    except IllegalArgumentException:
        return []
    result = model.transform(filteredText)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [mhash(r.id) % 10**8, Vectors.fromML(r.features)]).cache()
    # Cluster the documents into k topics using LDA.
    ldaModel = LDA.train(corpus, k=topics, maxIterations=100, optimizer='online')
    vocabArray = model.stages[2].vocabulary  # CountVectorizer vocabulary
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordsPerTopic))

    def topic_render(topic):
        # Map term indices back to the actual words.
        terms = topic[0]
        return [vocabArray[terms[i]] for i in range(wordsPerTopic)]

    return topicIndices.map(lambda topic: topic_render(topic)).collect()

def runLDA(data, NumberOfTopics):
    data = sc.parallelize(data)
    model = LDA.train(data, k=NumberOfTopics, seed=1)
    return model

def main(*x, **r):
    l = x
    dataset = "hdfs://" + r['ip'] + "/user/" + r['user'] + "/In/" + r['file']
    sc = r['sprkcontext']
    base = os.path.abspath(os.path.dirname(__file__))
    path = os.path.join(base, 'tuned', r['file'], str(r['label']))
    start_time = time.time()
    if not os.path.exists(path):
        os.makedirs(path)
    b = int(l[0])
    path1 = path + "/K_" + str(b) + "_a_" + str(l[1]) + "_b_" + str(l[2]) + ".txt"
    fo = open(path1, 'w')  # opening with 'w' already truncates the file
    score_topic = []
    corpus, vocabArray = preprocess(sc, path=dataset, vocabsize=50000, stopwordfile='')
    for i in range(10):
        fo.write("Run : " + str(i) + "\n")
        # Reshuffle the corpus between runs.
        x = corpus.collect()
        shuffle(x)
        corpus = sc.parallelize(x)
        ldaModel = LDA.train(corpus,
                             k=int(l[0]),
                             maxIterations=20,
                             docConcentration=float(l[1]),
                             topicConcentration=float(l[2]),  # was l[1]; the filename labels l[2] as beta
                             checkpointInterval=10,
                             optimizer='online')
        topicIndices = ldaModel.describeTopics(maxTermsPerTopic=10)
        topics = []
        for x in topicIndices:
            topics.append(zip(list(map(lambda a: str(vocabArray[int(a)]), x[0])), x[1]))
        for a in range(len(topics)):
            fo.write("Topic " + str(a) + ": ")
            str1 = ''
            for t in topics[a]:
                str1 += t[0] + " "
                fo.write(t[0] + " ")
            score_topic.append(str1)
            fo.write("\n")
        fo.write("\n")
    b = jaccard(int(l[0]), tops=score_topic, term=r['label'])
    fo.write("\nRuntime: --- %s seconds ---\n" % (time.time() - start_time))
    fo.write("\nScore: " + str(b))
    fo.close()
    return b

import shutil

def train():
    data = sc.textFile(corpus_filename).map(
        lambda line: Vectors.dense([float(i) for i in line.strip().split()]))
    # Index documents with unique IDs: (id, vector)
    corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    lda_model = LDA.train(rdd=corpus,
                          maxIterations=max_iter,
                          seed=seed,
                          checkpointInterval=checkin_point_interval,
                          k=K,
                          optimizer=optimizer,
                          docConcentration=alpha,
                          topicConcentration=beta)
    if os.path.exists('./ldamodel'):
        shutil.rmtree('./ldamodel')
    lda_model.save(sc, "./ldamodel")

import matplotlib.pyplot as plt

def lda_spark(sc, X=None, clusters=3):
    if X is None:
        X = users_as_parallelizable_sparse_data(users)
    X = sc.parallelize(X)
    X = X.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    ldaModel = LDA.train(X, k=clusters)
    topics = ldaModel.topicsMatrix()
    # seaborn no longer exposes matplotlib.pyplot as sns.plt; use plt directly.
    f, ax1 = plt.subplots(1, sharex=False, sharey=False)
    f.suptitle("Results of running LDA on Spark", fontsize=20)
    ax1.set_title("Heatmap over topics matrix")
    sns.heatmap(topics, ax=ax1)

def main():
    for tn in tablenames:
        data = spark.read.format("org.apache.spark.sql.cassandra")\
            .options(table=tn, keyspace=keyspace).load().limit(1000)
        data = data.sort('imdb_score', ascending=False)
        desc = data.rdd.map(lambda x: x['description']).filter(lambda x: x is not None)
        StopWords = nltk.corpus.stopwords.words('english')
        StopWords.extend([" ... See full summary"])
        # Tokenize, keep alphabetic words longer than 3 characters, drop stopwords.
        tokenized = desc.map(lambda y: y.strip().lower())\
            .map(lambda x: re.split(" ", x))\
            .map(lambda word: [x for x in word if x.isalpha()])\
            .map(lambda word: [x for x in word if len(x) > 3])\
            .map(lambda word: [x for x in word if x not in StopWords])\
            .zipWithIndex()
        df_txts = spark.createDataFrame(tokenized, ["words", 'index'])
        # Term frequency
        countVec = CountVectorizer(inputCol="words", outputCol="raw_features",
                                   vocabSize=5000, minDF=10.0)
        CountVectMod = countVec.fit(df_txts)
        result = CountVectMod.transform(df_txts)
        # Inverse document frequency
        idf = IDF(inputCol="raw_features", outputCol="features")
        idfModel = idf.fit(result)
        resultTFIdf = idfModel.transform(result)
        totalTopics = 10
        totalItr = 100
        LDAModel = MLlibLDA.train(
            resultTFIdf.select('index', 'features').rdd
                .mapValues(MLlibVectors.fromML).map(list),
            k=totalTopics, maxIterations=totalItr)
        maxwordsTopic = 5
        topicIndices = sc.parallelize(LDAModel.describeTopics(maxTermsPerTopic=5))
        VCarr = CountVectMod.vocabulary

        def finalTopic(topic):
            # Map term indices back to vocabulary words.
            terms = topic[0]
            return [VCarr[terms[i]] for i in range(maxwordsTopic)]

        topics_final = topicIndices.map(lambda topic: finalTopic(topic)).collect()
        print(topics_final)
        for topic in range(len(topics_final)):
            print("Topic" + str(topic) + ":")
            for term in topics_final[topic]:
                print(term)
            print('\n')

def LDA_spark():
    data = sc.textFile("data/mllib/sample_lda_data.txt")
    parsedData = data.map(
        lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
    # Index documents with unique IDs
    corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    # Cluster the documents into three topics using LDA
    Model = LDA.train(corpus, k=3)
    # Save and load the model
    Model.save(sc, "myModelPath")
    sameModel = LDAModel.load(sc, "myModelPath")

def train_model():
    sc = SparkContext(appName='lda_train', conf=conf)
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    model_vectors = sqlContext.read.parquet(
        '/user/rmusters/jan_threshold20_2015model99/data')
    logger.info("model loaded")
    rdd_words = model_vectors.rdd.map(lambda line: line[0])
    words = rdd_words.collect()  # 15919 words
    logger.info("Amount of words collected: %i", len(words))
    path = 'hdfs:///user/rmusters/data_jan_sample'
    data = sqlContext.read.parquet(path)
    logger.info("data loaded")
    # data = data.sample(False, 0.01)

    # Build the word index once, outside the per-document closure.
    word_dict = {v: i for i, v in enumerate(words)}

    def bow(filtered_text):
        # Bag of words: map each known word to its index and count occurrences.
        vector_dict = {}
        for w in filtered_text:
            if w in word_dict:
                vector_dict[word_dict[w]] = vector_dict.get(word_dict[w], 0) + 1
        return vector_dict

    # Note: if a data sample does not contain certain words, their counts are
    # zero and the sparse vector simply omits them.
    from pyspark.mllib.linalg import SparseVector
    size = len(words)
    logger.info("size of words: %i", size)
    # The bag of words is used to train LDA. Rows are (text, filtered_text, id).
    data = data.rdd.map(lambda row: (
        row[0], row[1], SparseVector(size, bow(row[1])), row[2]))
    logger.info("bag of words data")
    df = data.toDF(["text", "filtered_text", "vectors", "id"])
    df.write.parquet("hdfs:///user/rmusters/lda_data_jan", mode="overwrite")
    corpus = data.map(lambda x: [x[3], x[2]])  # (id, vector) pairs
    logger.info("Training the lda model")
    ldaModel = LDA.train(corpus, k=500)
    logger.info("Vocabsize is: %i", ldaModel.vocabSize())
    ldaModel.save(sc, 'hdfs:///user/rmusters/ldaModel_jan')
    logger.info("model saved")

def runOnlineLDA(data, numOfTags, K=10):
    '''
    Requires preprocessed input data.
    Input format: (key, [values]); all values are equally weighted.
    '''
    corpus = data.map(lambda kv: list(kv[1])) \
        .filter(lambda p: len(p) >= 2) \
        .zipWithIndex().map(lambda pi: (pi[1] + 1, pi[0])) \
        .mapValues(lambda p: SparseVector(numOfTags, {val: 1.0 for val in p})) \
        .map(lambda iv: [iv[0], iv[1]]).cache()
    return LDA.train(corpus, K, optimizer='online')

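# A hedged usage sketch for runOnlineLDA. The SparkContext name (`sc`) and the
# toy tag IDs are illustrative assumptions; each document is a (key, [tag ids])
# pair, matching the input format described in the docstring above.
docs = sc.parallelize([
    ("doc-a", [0, 2, 3]),
    ("doc-b", [1, 2]),
    ("doc-c", [0, 3, 4]),
])
model = runOnlineLDA(docs, numOfTags=5, K=2)
print(model.describeTopics(maxTermsPerTopic=3))
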
def LDAThis(sc, RDD, minFreq, numTopics, maxIter, wordsPerTopic):
    '''
    Arguments:
        sc: a SparkContext object
        RDD: an RDD with rows as tokenized sentences
        minFreq: minimum document frequency for CountVectorizer
        numTopics: number of topics
        maxIter: max number of iterations for LDA training
        wordsPerTopic: number of words to show per topic
    Requirements:
        sqlContext = SQLContext(sc) must be defined outside the function
    '''
    StopWords = stopwords.words("english")
    sqlContext = SQLContext(sc)
    # Structure the data
    idRDD = RDD.map(
        lambda words: [x for x in words if x.isalpha() and x not in StopWords]
    ).filter(lambda x: len(x) > 2).zipWithIndex()
    idDF = sqlContext.createDataFrame(idRDD, ["tokens", 'index'])
    # Term frequency
    CVecModel = CountVectorizer(inputCol="tokens", outputCol="rawFeatures",
                                vocabSize=5000, minDF=minFreq).fit(idDF)
    resultCVec = CVecModel.transform(idDF)
    vocabArray = CVecModel.vocabulary
    # Inverse document frequency
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(resultCVec)
    resultTFIDF = idfModel.transform(resultCVec)
    # LDA
    resultLDA = LDA.train(
        resultTFIDF.select('index', 'features').rdd.mapValues(Vectors.fromML).map(list),
        k=numTopics, maxIterations=maxIter)
    topicIndices = sc.parallelize(
        resultLDA.describeTopics(maxTermsPerTopic=wordsPerTopic))
    topicsFinal = topicIndices.map(
        lambda topic: render_topics(topic, wordsPerTopic, vocabArray)).collect()
    # Show topics
    for topic in range(len(topicsFinal)):
        print("Topic" + str(topic) + ":")
        for term in topicsFinal[topic]:
            print(term)
        print('\n')
    return resultLDA

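# LDAThis calls a render_topics helper that is not defined in this snippet.
# Below is a minimal sketch of what it presumably does, with the signature
# inferred from the call site: map describeTopics() term indices back to
# vocabulary words. Treat it as an assumption, not the original helper.
def render_topics(topic, wordsPerTopic, vocabArray):
    # topic is a (term_indices, term_weights) pair from describeTopics().
    terms = topic[0]
    return [vocabArray[terms[i]] for i in range(wordsPerTopic)]
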
def runLDA(filepath, n):
    data = sc.textFile(filepath)
    n_vcb = int(data.take(2)[1])  # the second line holds the vocabulary size
    # Lines are "doc_id term_id count"; group (term, count) pairs per document.
    parsedData = data.map(lambda line: line.strip().split(' ')) \
        .filter(lambda x: len(x) > 2) \
        .map(lambda x: (int(x[0]) - 1, (int(x[1]) - 1, float(x[2])))) \
        .groupByKey().mapValues(list)
    corpus = parsedData.map(lambda x: [x[0], Vectors.sparse(n_vcb, x[1])]).cache()
    ldaModel = LDA.train(corpus, k=n)
    print("Learned topics (as distributions over vocab of " +
          str(ldaModel.vocabSize()) + " words):")
    print(ldaModel.describeTopics(maxTermsPerTopic=20))
    return ldaModel.topicsMatrix()

def A1():
    # 1) Apply LDA and find topics in the user's posts (including reposts)
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  # strip punctuation and split on non-letters
    stopRemover = StopWordsRemover(
        inputCol="splitted", outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("russian") +
                  StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")
    # Filter if post id exists?
    data = uWallP\
        .filter(uWallP.text != "")\
        .select("id", "text")\
        .limit(10)
    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])
    model = pipeline.fit(data)
    result = model.transform(data)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [r.id, Vectors.fromML(r.features)]).cache()
    # Cluster the documents into k topics using LDA
    ldaModel = LDA.train(corpus, k=8, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.stages[2].vocabulary  # CountVectorizer vocabulary
    wordNumbers = 20  # number of words per topic
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

    def topic_render(topic):
        # Map term indices back to the actual words.
        terms = topic[0]
        return [vocabArray[terms[i]] for i in range(wordNumbers)]

    topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()
    for topic in range(len(topics_final)):
        print("Topic" + str(topic) + ":")
        for term in topics_final[topic]:
            print(term)
        print('\n')

def word_topics(num_topics=NUM_TOPICS, num_words_per_topics=NUM_WORDS_PER_TOPICS):
    """Generates topics from word clusters.

    Arguments:
        num_topics {integer} -- Number of topics to infer
        num_words_per_topics {integer} -- Number of terms to collect for each topic

    Returns:
        None
    """
    spark = init_spark(AITA_CLEANED_COLLECTION)
    data_rdd = spark.read.format('mongo').load().rdd
    preprocessed_rdd = data_rdd\
        .flatMap(lambda row: [row['header'].lower().split(' ') +
                              row['content'].lower().split(' ')]) \
        .zipWithIndex() \
        .map(lambda x: Row(index=x[1], words=x[0]))
    preprocessed_df = spark.createDataFrame(preprocessed_rdd)
    cv = CountVectorizer(inputCol='words', outputCol='vectors')
    model = cv.fit(preprocessed_df)
    vector_df = model.transform(preprocessed_df)
    corpus = vector_df.select('index', 'vectors').rdd.map(
        lambda x: [x[0], Vectors.fromML(x[1])]).cache()
    lda_model = LDA.train(corpus, k=num_topics, maxIterations=100, optimizer='online')
    vocab_array = model.vocabulary
    topic_indices = spark.sparkContext.parallelize(
        lda_model.describeTopics(maxTermsPerTopic=num_words_per_topics))

    def vector_id_to_word(topic):
        # Pair each term index with its weight and resolve it to a word.
        terms = topic[0]
        weights = topic[1]
        return [(vocab_array[terms[i]], weights[i])
                for i in range(num_words_per_topics)]

    topics = topic_indices.map(lambda topic: vector_id_to_word(topic)).collect()
    for i in range(len(topics)):
        print('Topic {}:'.format(i))
        for item in topics[i]:
            print(item)
        print('\n')

def main():
    p = sys.argv[1]
    logFile = "data/" + p + "_cleaned.txt"
    sc = SparkContext("local", "simpleApp")
    data = sc.textFile(logFile).cache()
    numberoftweets = data.count()
    words = data.flatMap(lambda x: x.split(" ")).distinct()
    word_list = words.collect()
    words = words.flatMap(lambda x: d(x, numberoftweets))
    data = data.zipWithIndex().map(lambda xy: (xy[1], xy[0].split(" ")))
    wc = data.flatMap(lambda x: func(x)).groupByKey().mapValues(lambda x: len(x))
    mat = words.leftOuterJoin(wc)\
        .map(lambda kv: (kv[0][0], (kv[0][1], f(kv[1][0], kv[1][1]))))\
        .groupByKey().sortByKey()\
        .mapValues(lambda x: list(x))\
        .mapValues(lambda x: ok(x))
    parsedData = mat.mapValues(
        lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))\
        .map(lambda xy: [xy[0], xy[1]])
    # Documents are already indexed with unique IDs
    corpus = parsedData.cache()
    # Cluster the documents into three topics using LDA
    ldaModel = LDA.train(corpus, k=3)
    # Output topics. Each is a distribution over words (matching word count vectors)
    topics = ldaModel.topicsMatrix()
    topics_dict = {}
    for topic in range(3):
        k = "Topic " + str(topic)
        topics_dict[k] = {}
        for word in range(0, ldaModel.vocabSize()):
            topics_dict[k][str(topics[word][topic])] = word_list[word]
    # Print the seven top-weighted words per topic.
    for i in topics_dict.keys():
        counter = 0
        z = sorted(topics_dict[i], reverse=True)
        for l in z:
            if counter == 7:
                break
            line = topics_dict[i][l] + " "
            counter += 1
            print(line)

def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)
    # Load the data
    data = sc.textFile(hdfs_path)
    # Compute term frequencies
    documents = data.map(tokenize)
    hashingTF = HashingTF(2 << 10)
    tf = hashingTF.transform(documents)
    # Index the document term-frequency vectors
    corpus = tf.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    # Mapping between indices and terms
    mapping = hashing_term_mapping(documents)
    mapping.cache()
    # Train the LDA model
    ldaModel = LDA.train(corpus, k=3)
    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)
    # Save the results to MongoDB
    topics = ldaModel.describeTopics(maxTermsPerTopic=10)
    for topic in range(3):
        doc = {}
        doc['name'] = "topic " + str(topic)
        doc['terms'] = []
        for i in range(10):
            term_index = topics[topic][0][i]
            for term in mapping.lookup(term_index):
                doc['terms'].append([term.encode("utf8"), topics[topic][1][i]])
        send_mongodb(mongo_client, doc)

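# The snippet above relies on a hashing_term_mapping helper that is not shown.
# A plausible sketch, under the assumption (inferred from the call sites
# mapping.cache() and mapping.lookup(term_index)) that it builds a pair RDD
# from HashingTF bucket index to term, so model term indices can be resolved
# back to concrete words.
def hashing_term_mapping(documents, num_features=2 << 10):
    hashing_tf = HashingTF(num_features)
    return documents.flatMap(lambda tokens: tokens).distinct().map(
        lambda term: (hashing_tf.indexOf(term), term))
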
from operator import add

def getCellLDA(Documents, NumberOfTopics):
    corpus = sc.parallelize(Documents)
    # Count term occurrences across the whole corpus.
    term_counts = corpus.flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(add)
    # Assign every distinct term an index.
    vocabulary = term_counts.map(lambda x: x[0]).zipWithIndex().collectAsMap()
    # Convert each (document, index) pair into an [index, SparseVector] pair.
    documents = corpus.zipWithIndex().map(
        lambda doc: documentToSparseVector(doc, vocabulary)).map(list)
    lda = LDA.train(documents, k=NumberOfTopics, seed=1)
    return lda, vocabulary

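# documentToSparseVector is called above (and in the later getCellLDA variant)
# but never defined in these snippets. A minimal sketch with the signature
# inferred from the call site — it presumably counts each document's tokens at
# their vocabulary indices; treat it as an assumption, not the original helper.
def documentToSparseVector(doc, vocabulary):
    # doc is a (tokens, index) pair produced by zipWithIndex().
    tokens, index = doc
    counts = {}
    for token in tokens:
        term_id = vocabulary[token]
        counts[term_id] = counts.get(term_id, 0) + 1.0
    return (index, Vectors.sparse(len(vocabulary), counts))
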
def LDA_Treatment(text):  # renamed from `str`, which shadowed the builtin
    finalTopics = []
    txt = wordTokenize(text)
    data = sc.parallelize([txt]).zipWithIndex().map(
        lambda val: Row(idd=val[1], _words=val[0].split(" ")))
    # Let Spark infer the schema from the Row objects (idd, _words).
    docDF = spark.createDataFrame(data)
    Vector = CountVectorizer(inputCol="_words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)
    corpus = result.select("idd", "vectors").rdd.map(
        lambda val: [val[0], Vectors.fromML(val[1])]).cache()
    ldaModel = LDA.train(corpus, k=nbTopics, maxIterations=1000, optimizer='online')
    vocabArray = model.vocabulary
    topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
    topics_final = topicIndices.map(
        lambda topic: topic_render(topic, vocabArray)).collect()
    for topic in range(len(topics_final)):
        for term in topics_final[topic]:
            finalTopics.append(unidecode.unidecode(term))
    return finalTopics

def main():
    p = sys.argv[1]
    logFile = "data/" + p + "_cleaned.txt"
    sc = SparkContext("local", "simpleApp")
    sqlContext = SQLContext(sc)
    data = sc.textFile(logFile).zipWithIndex().map(
        lambda wi: Row(idd=wi[1], words=wi[0].split(" "))).cache()
    docDF = sqlContext.createDataFrame(data)
    Vector = CountVectorizer(inputCol="words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)
    corpus_size = result.count()
    corpus = result.select("idd", "vectors").rdd.map(
        lambda xy: [xy[0], xy[1]]).cache()
    # Cluster the documents into three topics using LDA
    ldaModel = LDA.train(corpus, k=3, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    wordNumbers = 10
    topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
    vocabArray = model.vocabulary
    topics_final = topicIndices.map(
        lambda topic: topic_render(topic, wordNumbers, vocabArray)).collect()
    path = "data/" + p + "_results.txt"
    open(path, 'wb').close()  # truncate the results file
    for topic in topics_final:
        for term in topic:
            line = term[0] + " "
            try:
                # Round-trip through UTF-8 to replace characters the shell cannot handle.
                string_for_output = line.encode('utf8', 'replace').decode('utf8')
                if string_for_output != " ":
                    os.system("python3 basic/codes/p3p.py " + string_for_output +
                              " >> " + path)
            except Exception:
                pass
    os.system("python3 basic/codes/p3p.py " + "delmch" + " >> " + path)

def getCellLDA(Documents, NumberOfTopics, OnlineOptimizer):
    corpus = Documents
    # Count term occurrences across the whole corpus.
    term_counts = corpus.flatMap(lambda x: x).map(
        lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    # Assign every distinct term an index.
    vocabulary = term_counts.map(lambda x: x[0]).zipWithIndex().collectAsMap()
    # Convert each (document, index) pair into an [index, SparseVector] pair.
    documents = corpus.zipWithIndex().map(
        lambda doc: documentToSparseVector(doc, vocabulary)).map(list)
    optimizer = "online" if OnlineOptimizer else "em"
    lda = LDA.train(documents, k=NumberOfTopics, maxIterations=20, optimizer=optimizer)
    return lda, vocabulary

from pyspark.sql import SQLContext, Row

sc = SparkContext()
# The input file is a term-document matrix generated by make_tdm.py
data = sc.textFile(
    "/Users/Zhen/Desktop/Courses/BigData/stackexchange/topicModeling/result/matrix.csv")
header = data.first()  # extract the header
data = data.filter(lambda x: x != header)
data = data.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(',')]))
# Index documents with unique IDs
corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
# Cluster the documents into k topics using LDA
ldaModel = LDA.train(corpus, k=30)
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()

import numpy
numpy.savetxt(
    "/Users/Zhen/Desktop/Courses/BigData/stackexchange/topicModeling/result/lda_topicMatrix.csv",
    topics, delimiter=",")

score_topic = []
for i in range(10):
    fo.write("Run : " + str(i) + "\n")
    # Reshuffle the corpus between runs.
    x = corpus.collect()
    shuffle(x)
    corpus = sc.parallelize(x)
    ldaModel = LDA.train(corpus,
                         k=10,
                         maxIterations=20,
                         docConcentration=-1.0,
                         topicConcentration=-1.0,
                         checkpointInterval=10,
                         optimizer='online')
    topicIndices = ldaModel.describeTopics(maxTermsPerTopic=10)
    topics = []
    for x in topicIndices:
        topics.append(zip(list(map(lambda a: str(vocabArray[int(a)]), x[0])), x[1]))
    for a in range(len(topics)):
        fo.write("Topic " + str(a) + ": ")
        str1 = ''
        for b in topics[a]:
            str1 += b[0] + " "
            fo.write(b[0] + " ")

# Load and parse the data
from pyspark.sql.types import StringType
from pyspark.sql.functions import *
from pyspark.mllib.linalg import Vectors as MLlibVectors

corpus_df = spark.createDataFrame(data, StringType())
corpus_df = corpus_df.withColumn("index", monotonically_increasing_id())
corpus_df = corpus_df.withColumn("arrayColumn", array("value"))
Vector = CountVectorizer(inputCol="arrayColumn", outputCol="vectors")
model = Vector.fit(corpus_df)
result = model.transform(corpus_df)

# Cluster the documents into topics using LDA. MLlib's LDA expects an RDD of
# [index, mllib vector] pairs, so convert the ML vectors from CountVectorizer.
num_topics = 10
max_iterations = 100
lda_model = LDA.train(
    result.select('index', 'vectors').rdd.map(
        lambda row: [row[0], MLlibVectors.fromML(row[1])]),
    k=num_topics, maxIterations=max_iterations)

result.show(truncate=False)

from collections import defaultdict

def document_vector(document):
    # Reconstructed header: the original opening of this helper was cut off.
    # Inferred from the call site below, it converts a (tokens, id) pair into
    # an (id, SparseVector) pair of per-document term counts.
    tokens, id = document
    counts = defaultdict(int)
    for token in tokens:
        token_id = vocabulary.get(token)
        if token_id is None:
            continue
        counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (id, Vectors.sparse(len(vocabulary), keys, values))

# Process all of the documents into word vectors using the
# `document_vector` function defined previously
documents = tokens.zipWithIndex().map(document_vector).map(list)

# Get an inverted vocabulary, so we can look up the word by its index value
inv_voc = {value: key for (key, value) in vocabulary.items()}

# Open an output file
with open("new_output.txt", 'w') as f:
    lda_model = LDA.train(documents, k=num_topics, maxIterations=max_iterations)
    topic_indices = lda_model.describeTopics(maxTermsPerTopic=num_words_per_topic)
    # Print topics, showing the top-weighted terms for each topic
    for i in range(len(topic_indices)):
        f.write("Topic #{0}\n".format(i + 1))
        for j in range(len(topic_indices[i][0])):
            f.write("{0}\t{1}\n".format(inv_voc[topic_indices[i][0][j]],
                                        topic_indices[i][1][j]))
    f.write("{0} topics distributed over {1} documents and {2} unique words\n"
            .format(num_topics, documents.count(), len(vocabulary)))

for file_id in range(1, kFileNum):
    YOUR_FILE = ("wet_data/CC-MAIN-20150728002301-%05d-ip-10-236-191-2"
                 ".ec2.internal.warc.wet" % file_id)
    YOUR_DELIMITER = "WARC/1.0"
    text_file = sc.newAPIHadoopFile(
        YOUR_FILE,
        "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
        "org.apache.hadoop.io.LongWritable",
        "org.apache.hadoop.io.Text",
        conf={"textinputformat.record.delimiter": YOUR_DELIMITER}).map(lambda l: l[1])
    file_words = text_file.map(lambda file: file.replace('\n', ' '))
    current_corpus = file_words.map(gen_vectors)
    if total_corpus is None:
        total_corpus = current_corpus
    else:
        total_corpus = total_corpus.union(current_corpus)

total_corpus = total_corpus.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
kNumTopics = 10
# Cluster the documents into k topics using LDA
ldaModel = LDA.train(total_corpus, k=kNumTopics)
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
topic_words = []
for topic in range(kNumTopics):
    word_weight = []
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        word_weight.append((word, topics[word][topic]))
    sorted_word_weight = sorted(word_weight, key=lambda x: -x[1])
    print(sorted_word_weight)

# Create a tweet ID
from pyspark.sql.functions import monotonically_increasing_id
df = df.withColumn("tweet_id", monotonically_increasing_id())
df.show()

# Count Vectorizer
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol='ngrams', outputCol='features',
                     vocabSize=100000, minDF=2)
cvmodel = cv.fit(df)
result = cvmodel.transform(df)
result.show()

from pyspark.mllib.linalg import Vectors as MLlibVectors
from pyspark.mllib.clustering import LDA as MLlibLDA

# Train the LDA model
model = MLlibLDA.train(
    result.select("tweet_id", "features").rdd
        .mapValues(MLlibVectors.fromML).map(list),
    k=3)

# Show topics and weights
topics = model.describeTopics(maxTermsPerTopic=50)
for x, topic in enumerate(topics):
    print('topic number: ' + str(x))
    words = topic[0]
    weights = topic[1]
    for n in range(len(words)):
        print(cvmodel.vocabulary[words[n]] + ' ' + str(weights[n]))

rdd = sc.textFile('opinion.txt').zipWithIndex().map(
    lambda wi: Row(idd=wi[1], words=wi[0].split(" ")))
docDF = spark.createDataFrame(rdd)
Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)
corpus = result.select("idd", "vectors").rdd.map(
    lambda xy: [xy[0], Vectors.fromML(xy[1])]).cache()
# Cluster the documents into five topics using LDA
ldaModel = LDA.train(corpus, k=5, maxIterations=100, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary
wordNumbers = 5  # number of words per topic
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

def topic_render(topic):
    # Map term indices back to the actual words.
    terms = topic[0]
    return [vocabArray[terms[i]] for i in range(wordNumbers)]

sqlContext = SQLContext(sc)
path = ...  # path of the txt file
data = sc.textFile(path).zipWithIndex().map(
    lambda wi: Row(idd=wi[1], words=wi[0].split(" ")))
docDF = sqlContext.createDataFrame(data)
Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)
corpus_size = result.count()  # total number of documents
corpus = result.select("idd", "vectors").rdd.map(lambda xy: [xy[0], xy[1]]).cache()
# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, maxIterations=100, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary
wordNumbers = 10  # number of words per topic
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

def topic_render(topic):
    # Map term indices back to the actual words.
    terms = topic[0]
    return [vocabArray[terms[i]] for i in range(wordNumbers)]

topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()

df_comments = sqlContext.createDataFrame(comments, ["list_of_words", 'index'])
# TF
cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features",
                     vocabSize=50000, minDF=10.0)
cvmodel = cv.fit(df_comments)
result_cv = cvmodel.transform(df_comments)
# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

lda = LDA(k=3, maxIter=50)
model = lda.fit(result_tfidf[['index', 'features']])
transformed = model.transform(result_tfidf)
model.describeTopics(8).show()
# ll = model.logLikelihood(result_tfidf[['index', 'features']])
# lp = model.logPerplexity(result_tfidf[['index', 'features']])

# Build an index -> word mapping from the CountVectorizer vocabulary.
vocabulary = {j: word for j, word in enumerate(cvmodel.vocabulary)}

# TF
cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features",
                     vocabSize=5000, minDF=10.0)
cvmodel = cv.fit(df_txts)
result_cv = cvmodel.transform(df_txts)
# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

num_topics = 10
max_iterations = 100
# MLlib's LDA expects an RDD of [index, vector] pairs, not a DataFrame.
lda_model = LDA.train(result_tfidf[['index', 'features']].rdd.map(list),
                      k=num_topics, maxIterations=max_iterations)

wordNumbers = 5
vocabArray = cvmodel.vocabulary
topicIndices = spark.sparkContext.parallelize(
    lda_model.describeTopics(maxTermsPerTopic=wordNumbers))

def topic_render(topic):
    # Map term indices back to the actual words.
    terms = topic[0]
    return [vocabArray[terms[i]] for i in range(wordNumbers)]

hashed_word = pd.DataFrame(hashed.collect(),
                           columns=['hash', 'word']).set_index('hash')
# Generate TF-IDF
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF().fit(tf)
tf_idf_data = idf.transform(tf)
print(dt.now().strftime('%Y/%m/%d %H:%M:%S'))

K = 5
# Index documents with unique IDs
corpus_data = tf_idf_data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
print(corpus_data)
# Cluster the documents into K topics using LDA
ldaModel = LDA.train(corpus_data, k=K)
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
print(dt.now().strftime('%Y/%m/%d %H:%M:%S'))

def idx_to_word(idx):
    # pandas' DataFrame.ix is removed; use .loc instead.
    res = hashed_word.loc[idx].word
    if type(res) == pd.Series:
        return list(res.to_dict().values())[0]
    else:
        return res

rep_num = 20
for topic in range(K):
    print("Topic " + str(topic) + ":")

import warnings
import pyspark as ps
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

try:
    # Create a SparkContext using all 4 CPUs available on this laptop
    sc = ps.SparkContext('local[4]')
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")

# Load and parse the data
data = sc.textFile("spark_sample_data/sample_lda_data.txt")
parsedData = data.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
print(corpus.take(10))
# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, optimizer='online')
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

def toSparseVector(corpusLine, nFeatures):
    # corpusLine is a list of (index, value) pairs, e.g. gensim bag-of-words format.
    v = {idx: val for idx, val in corpusLine}
    return Vectors.sparse(nFeatures, v)

nFeatures = len(dic)
corpusParallel = sc.parallelize(corpus)
corpusMapped = corpusParallel.map(lambda doc: toSparseVector(doc, nFeatures))
# Index documents with unique IDs
corpusIndexed = corpusMapped.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
nTopics = 10
ldaModel = LDA.train(corpusIndexed, k=nTopics)

# Dirty trick -- use sklearn's LDA to do the transform step.
# This should be possible on Spark, but it is unclear how.
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=nTopics,  # `n_topics` was renamed in newer sklearn
                                max_iter=1,
                                learning_method='online',
                                learning_offset=50.)
doc0 = corpusIndexed.first()[1].toArray()
lda.fit(doc0.reshape(1, -1))  # fit expects a 2-D array
lda.components_ = ldaModel.topicsMatrix().T

def getDocumentTopics(docTokens, lda):
    wcTuples = dic.doc2bow(docTokens)

from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="LatentDirichletAllocationExample")  # SparkContext
    # $example on$
    # Load and parse the data
    data = sc.textFile("/home/pipi/files/DATASETS/SparkMLlib/sample_lda_data.txt")
    parsedData = data.map(
        lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
    # Index documents with unique IDs
    corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    # Cluster the documents into three topics using LDA
    ldaModel = LDA.train(corpus, k=3)
    # Output topics. Each is a distribution over words (matching word count vectors)
    print("Learned topics (as distributions over vocab of " +
          str(ldaModel.vocabSize()) + " words):")
    topics = ldaModel.topicsMatrix()
    for topic in range(3):
        print("Topic " + str(topic) + ":")
        for word in range(0, ldaModel.vocabSize()):
            print(" " + str(topics[word][topic]))
    # Save and load the model
    ldaModel.save(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")
    sameModel = LDAModel\
        .load(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")

'yyyy-MM-dd').alias('no_timestamp')).groupby('no_timestamp').count().sort(
    F.col('no_timestamp'))
dates.show(dates.count())
dates.toPandas().plot(kind='line', x='no_timestamp', y='count')
dates.toPandas().plot(kind='bar', x='no_timestamp')

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
prep_df = tokenizer.transform(df)
cv_prep = CountVectorizer(inputCol="words", outputCol="prep")
cv_model = cv_prep.fit(prep_df)
ready_df = cv_model.transform(prep_df)
# An rdd.map lambda receives a single Row, not two arguments.
trainable = ready_df.select('tweet_id', 'prep').rdd.map(
    lambda row: [row[0], Vectors.fromML(row[1])]).cache()
print("Trainable")
print(trainable.take(10))
model = LDA.train(trainable, k=5, seed=1, optimizer="online")

# Print the topics in the model. The vocabulary lives on the fitted
# CountVectorizerModel (cv_model), not on the estimator.
topics = model.describeTopics(maxTermsPerTopic=15)
for x, topic in enumerate(topics):
    print('topic nr: ' + str(x))
    words = topic[0]
    weights = topic[1]
    for n in range(len(words)):
        print(cv_model.vocabulary[words[n]] + ' ' + str(weights[n]))

id_index = (clean
            .map(lambda id_tokens: id_tokens[0])
            .zipWithIndex()
            .map(lambda id_zid: (id_zid[1], id_zid[0]))
            )

# -------------------------------------------------------------------------------
# Snippet 5: model
# LDA Model
lda_model = LDA.train(
    rdd=tf_matrix,
    k=num_topics,
    maxIterations=50,
    seed=1300,
    optimizer='em'
)

topics_matrix = sc.broadcast(lda_model.topicsMatrix())

# Document Topics
doc_topics = (tf_matrix
              .map(lambda zid_dv: (zid_dv[0], zid_dv[1].dot(topics_matrix.value)))
              .map(lambda zid_res: (zid_res[0], zid_res[1] * (1 / np.sum(zid_res[1]))))
              .join(id_index)
              .map(lambda joined: (joined[1][1], list(joined[1][0])))
              )

# Topic Terms

sentences = tolstoy.filter(lambda s: len(s) > 0)
# We have a fair amount of data wrangling to do to get things into the right
# format for Spark's LDA.
# First, we're going to identify the top words in the corpus and only keep
# track of those words. Those top words will form our vocabulary.
word_counts = sentences.flatMap(lambda s: s.split(" ")).map(
    lambda w: (w.lower(), 1)).reduceByKey(lambda a, b: a + b)
top_words = word_counts.takeOrdered(500, key=lambda wc: -wc[1])
vocabulary = [str(k) for (k, v) in top_words]
# We also want a Broadcast version of the vocabulary list.
br_vocabulary = sc.broadcast(vocabulary)
# Next, we need to convert the raw text sentences into a dense-vector representation.
dense_vectors = sentences.map(lambda s: vectorizer(s, br_vocabulary))
# Finally, we create our corpus by giving each sentence an ID.
corpus = dense_vectors.zipWithIndex().map(lambda vi: [vi[1], vi[0]])
# Now we can train an LDA model on our data.
lda_model = LDA.train(corpus, k=3, maxIterations=20)
# Output topics. For each topic, print out the top words contributing to that topic.
print("Learned topics (as distributions over vocab of " +
      str(lda_model.vocabSize()) + " words):")
topics = lda_model.topicsMatrix()
for topic in range(topics.shape[1]):
    print("Topic " + str(topic) + ":")
    topic_word_counts = sorted(zip(vocabulary, lda_model.topicsMatrix()[:, topic]),
                               key=lambda wc: -wc[1])
    top_words = [w for (w, c) in topic_word_counts[:30]]
    print(top_words)

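# The vectorizer helper used above is not shown. A minimal sketch, assuming
# (from the call site) that it counts each sentence's words against the
# broadcast vocabulary and returns a dense count vector aligned with it.
from pyspark.mllib.linalg import Vectors

def vectorizer(sentence, br_vocabulary):
    vocab = br_vocabulary.value
    index = {w: i for i, w in enumerate(vocab)}  # word -> position in the vector
    counts = [0.0] * len(vocab)
    for w in sentence.split(" "):
        i = index.get(w.lower())
        if i is not None:
            counts[i] += 1.0
    return Vectors.dense(counts)
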
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA
import numpy as np
from numpy.testing import assert_almost_equal, assert_equal

sc = SparkContext('local[*]', appName='Word2Vec')
data = [
    [1, Vectors.dense([0.0, 1.0, 0.5])],
    [3, Vectors.dense([0.9, 1.2, 0.4])]]
rdd = sc.parallelize(data)
model = LDA.train(
    rdd,
    k=2,
    maxIterations=40,
    docConcentration=-1.0,
    topicConcentration=-1.0,
    seed=100,
    checkpointInterval=10,
    optimizer='em')
print(model.vocabSize())
print(model.topicsMatrix())
# topics = model.describeTopics(1)
topics = model.topicsMatrix()
for word in topics:
    print(word)

from time import time

p = rescaledData.select('features')
p = p.limit(650000)  # choose the number of comments you want to run LDA on

import threading
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO, filename='running.log', filemode='w')

# Calculate LDA:
start = time()
lda = LDA(k=20, maxIter=500)
model = lda.fit(p)
print('used LDA: {:.2f}s'.format(time() - start))
# ll = model.logLikelihood(p)  # lower bound on the log likelihood of the corpus
# lp = model.logPerplexity(p)  # upper bound on perplexity

start = time()
# Describe topics.
topics = model.describeTopics(15)
