Example #1
def latent_dirichlet_allocation(unclustered_data,
                                number_of_clusters,
                                max_iterations=20,
                                doc_concentration=-1.0,
                                topic_concentration=-1.0,
                                seed=None,
                                checkpoint_interval=10,
                                optimizer='em'):

    if number_of_clusters < 1:
        raise ValueError("While clustering with LDA, \
                the given number of clusters is not positive")

    parsedData = unclustered_data.map(lambda lst: Vectors.dense(lst))
    corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    ldaModel = LDA.train(rdd=corpus,
                         k=number_of_clusters,
                         maxIterations=max_iterations,
                         docConcentration=doc_concentration,
                         topicConcentration=topic_concentration,
                         seed=seed,
                         checkpointInterval=checkpoint_interval,
                         optimizer=optimizer)
    topics = ldaModel.topicsMatrix()
    return [ldaModel, topics]
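A minimal usage sketch for the wrapper above, assuming the pyspark.mllib imports the function itself relies on; the SparkContext name and the toy term-count rows are invented for illustration:

from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA

sc = SparkContext(appName="lda_wrapper_demo")

# Toy term-count rows, one list per document (hypothetical data)
docs = sc.parallelize([
    [1.0, 2.0, 0.0, 6.0],
    [0.0, 3.0, 4.0, 1.0],
    [5.0, 0.0, 1.0, 2.0],
])

model, topics_matrix = latent_dirichlet_allocation(docs, number_of_clusters=2)
print(topics_matrix)  # vocabSize x k matrix of expected word counts per topic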
Example #2
def getKeywordsInDataRange(sDF,
                           oldestTime,
                           newestTime,
                           topics=1,
                           wordsPerTopic=20):  # oldestTime/newestTime format: yyyy-MM-dd
    #Filter
    oldestTime = datetime.strptime(oldestTime, '%Y-%m-%d')
    newestTime = datetime.strptime(newestTime, '%Y-%m-%d')

    filteredText = sDF\
                    .select( "id", date_format('day','yyyy-MM-dd').alias('time'), col("title").alias("text") )\
                    .where( (col("time") >= oldestTime) & (col("time") <= newestTime) )

    #StartPipeline for preparing data
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  #Remove signs and split by spaces
    stopRemover = StopWordsRemover(
        inputCol="splitted",
        outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")
    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])

    #Get corpus for LDA
    try:
        model = pipeline.fit(filteredText)
    except IllegalArgumentException:
        return []
    result = model.transform(filteredText)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [mhash(r.id) % 10**8,
                   Vectors.fromML(r.features)]).cache()

    # Cluster the documents into k topics using LDA
    ldaModel = LDA.train(corpus,
                         k=topics,
                         maxIterations=100,
                         optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.stages[2].vocabulary  #CountVectorizer
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordsPerTopic))

    def topic_render(topic):  # specify vector id of words to actual words
        terms = topic[0]
        result = []
        for i in range(wordsPerTopic):
            term = vocabArray[terms[i]]
            result.append(term)
        return result

    # topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()
    # for topic in range(len(topics_final)):
    #     print ("Topic" + str(topic) + ":")
    #     for term in topics_final[topic]:
    #         print (term)
    #     print ('\n')
    return topicIndices.map(lambda topic: topic_render(topic)).collect()
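A hedged usage sketch for getKeywordsInDataRange, assuming the module-level pieces the function relies on (a SparkSession named spark, the pyspark.ml and pyspark.mllib imports, and an mhash helper). The DataFrame below and the mhash stand-in are invented for illustration:

import datetime
from pyspark.sql import SparkSession
from pyspark.sql.utils import IllegalArgumentException

spark = SparkSession.builder.appName("keywords_demo").getOrCreate()

def mhash(value):
    # Stand-in for the mhash helper the function expects (assumed: a numeric hash of the id)
    return hash(str(value))

sDF = spark.createDataFrame([
    (1, datetime.date(2020, 1, 2), "spark streaming topic modeling at scale"),
    (2, datetime.date(2020, 1, 5), "latent dirichlet allocation for news titles"),
    (3, datetime.date(2020, 2, 1), "an article outside the requested range"),
], ["id", "day", "title"])

keywords = getKeywordsInDataRange(sDF, "2020-01-01", "2020-01-31", topics=1, wordsPerTopic=5)
print(keywords)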
Example #3
def runLDA(data,NumberOfTopics):
	#print("\n\n\n%")
	#print(data)
	data = sc.parallelize(data)
	#print("$")
	#print(data)
	#print("\n\n\n")
	model = LDA.train(data,k=NumberOfTopics,seed=1)
	return model
Example #4
def main(*x, **r):
    l = x
    dataset = "hdfs://" + r['ip'] + "/user/" + r['user'] + "/In/" + r['file']
    sc = r['sprkcontext']

    base = os.path.abspath(os.path.dirname(__file__))
    path = os.path.join(base, 'tuned', r['file'], str(r['label']))
    start_time = time.time()
    if not os.path.exists(path):
        os.makedirs(path)
    b = int(l[0])
    path1 = path + "/K_" + str(b) + "_a_" + str(l[1]) + "_b_" + str(
        l[2]) + ".txt"
    # 'w+' truncates an existing file, so no separate truncate pass is needed
    fo = open(path1, 'w+')
    score_topic = []
    corpus, vocabArray = preprocess(sc,
                                    path=dataset,
                                    vocabsize=50000,
                                    stopwordfile='')
    #corpus.cache()
    for i in range(10):
        fo.write("Run : " + str(i) + "\n")

        x = corpus.collect()
        shuffle(x)
        corpus = sc.parallelize(x)
        #corpus.cache()
        ldaModel = LDA.train(corpus,
                             k=int(l[0]),
                             maxIterations=20,
                             docConcentration=float(l[1]),
                             topicConcentration=float(l[2]),
                             checkpointInterval=10,
                             optimizer='online')
        # println(s"\t $distLDAModel.topicsMatrix().toArray()")
        topicIndices = ldaModel.describeTopics(maxTermsPerTopic=10)
        topics = []
        for x in topicIndices:
            topics.append(
                zip(list(map(lambda a: str(vocabArray[int(a)]), x[0])), x[1]))
        for a in range(len(topics)):
            fo.write("Topic " + str(a) + ": ")
            str1 = ''
            for b in topics[a]:
                str1 += b[0] + " "
                fo.write(b[0] + " ")
            score_topic.append(str1)
            fo.write("\n")
        fo.write("\n")
    b = jaccard(int(l[0]), tops=score_topic, term=r['label'])
    fo.write("\nRuntime: --- %s seconds ---\n" % (time.time() - start_time))
    fo.write("\nScore: " + str(b))
    fo.close()
    return b
Example #5
    def train():
        data = sc.textFile(corpus_filename).map(lambda line: Vectors.dense([float(i) for i in line.strip().split()]))
        corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
        # print(corpus.take(5))

        lda_model = LDA.train(rdd=corpus, maxIterations=max_iter, seed=seed, checkpointInterval=checkin_point_interval,
                              k=K,
                              optimizer=optimizer, docConcentration=alpha, topicConcentration=beta)
        if os.path.exists('./ldamodel'): __import__('shutil').rmtree('./ldamodel')
        lda_model.save(sc, "./ldamodel")
Example #6
def lda_spark(sc, X=None, clusters=3):
	if X is None:
		X = users_as_parallelizable_sparse_data(users)
	X = sc.parallelize(X)
	X = X.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
	ldaModel = LDA.train(X, k=clusters)
	topics = ldaModel.topicsMatrix()
	f, (ax1) = sns.plt.subplots(1, sharex=False, sharey=False)
	f.suptitle("Results of running LDA on spark", fontsize=20)
	ax1.set_title("Heatmap over topics matrix")
	sns.heatmap(topics, ax=ax1)
Example #7
def main():
    for tn in tablenames:
        data = spark.read.format("org.apache.spark.sql.cassandra")\
                    .options(table=tn, keyspace=keyspace).load().limit(1000)

        data = data.sort('imdb_score', ascending=False)

        desc = data.rdd.map(lambda x: x['description']).filter(
            lambda x: x is not None)

        StopWords = nltk.corpus.stopwords.words('english')
        StopWords.extend([" ...                See full summary"])

        tokenized = desc.map( lambda y: y.strip().lower()).map( lambda x: re.split(" ", x))\
            .map( lambda word: [x for x in word if x.isalpha()]).map( lambda word: [x for x in word if len(x) > 3] )\
            .map( lambda word: [x for x in word if x not in StopWords]).zipWithIndex()

        df_txts = spark.createDataFrame(tokenized, ["words", 'index'])
        countVec = CountVectorizer(inputCol="words",
                                   outputCol="raw_features",
                                   vocabSize=5000,
                                   minDF=10.0)
        CountVectMod = countVec.fit(df_txts)
        result = CountVectMod.transform(df_txts)
        idf = IDF(inputCol="raw_features", outputCol="features")
        idfModel = idf.fit(result)
        resultTFIdf = idfModel.transform(result)

        totalTopics = 10
        totalItr = 100
        LDAModel = MLlibLDA.train(resultTFIdf.select('index','features').rdd.mapValues(MLlibVectors.fromML).map(list),\
                        k=totalTopics, maxIterations=totalItr)

        maxwordsTopic = 5
        topicIndices = sc.parallelize(
            LDAModel.describeTopics(maxTermsPerTopic=5))
        VCarr = CountVectMod.vocabulary

        def finalTopic(topic):
            terms = topic[0]
            result = []
            for i in range(maxwordsTopic):
                term = VCarr[terms[i]]
                result.append(term)
            return result

        topics_final = topicIndices.map(
            lambda topic: finalTopic(topic)).collect()
        print(topics_final)
        for topic in range(len(topics_final)):
            print("Topic" + str(topic) + ":")
            for term in topics_final[topic]:
                print(term)
            print('\n')
Example #8
def lda_spark(sc, X=None, clusters=3):
    if X is None:
        X = users_as_parallelizable_sparse_data(users)
    X = sc.parallelize(X)
    X = X.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    ldaModel = LDA.train(X, k=clusters)
    topics = ldaModel.topicsMatrix()
    f, (ax1) = sns.plt.subplots(1, sharex=False, sharey=False)
    f.suptitle("Results of running LDA on spark", fontsize=20)
    ax1.set_title("Heatmap over topics matrix")
    sns.heatmap(topics, ax=ax1)
def LDA_spark():
	data = sc.textFile("data/mllib/sample_lda_data.txt")
	parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
	corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

	# Cluster the documents into three topics using LDA
	Model = LDA.train(corpus, k=3)
			
	# Save and load model
	Model.save(sc, "myModelPath")
	sameModel = LDAModel.load(sc, "myModelPath")
Example #10
def train_model():
    sc = SparkContext(appName='lda_train', conf=conf)
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)

    model_vectors = sqlContext.read.parquet(
        '/user/rmusters/jan_threshold20_2015model99/data')
    logger.info("model loaded")
    rdd_words = model_vectors.map(lambda line: line[0])
    words = rdd_words.collect()  #15919
    logger.info("Amount of words collected: %i", len(words))

    path = 'hdfs:///user/rmusters/data_jan_sample'
    data = sqlContext.read.parquet(path)

    # logger.info("data loaded")
    # data = data.sample(False, 0.01)
    # logger.info("data sampled")

    def bow(filtered_text):
        word_dict = {}
        vector_dict = {}
        for i, v in enumerate(words):
            word_dict[v] = i
            vector_dict[i] = 0
        for w in filtered_text:
            if w in words:
                vector_dict[word_dict[w]] = vector_dict[word_dict[w]] + 1
        return vector_dict

    #check if sum of vector is zero 13 times. This indicates the datasample does not contain certain words and thus the sparse vector removes them
    from pyspark.mllib.linalg import SparseVector
    size = len(words)
    logger.info("size of words: %i", size)

    #bag of words is used to train LDA
    data = data.map(lambda (text, filtered_text, id): (
        text, filtered_text, SparseVector(size, bow(filtered_text)), id))
    logger.info("bag of words data")

    df = data.toDF(["text", "filtered_text", "vectors", "id"])
    df.write.parquet("hdfs:///user/rmusters/lda_data_jan", mode="overwrite")

    corpus = data.map(lambda (text, filtered_text, vector, id): [id, vector])

    logger.info("Training the lda model")
    ldaModel = LDA.train(corpus, k=500)
    logger.info("Vocabsize is: %i", ldaModel.vocabSize())

    ldaModel.save(sc, 'hdfs:///user/rmusters/ldaModel_jan')
    logger.info("model saved")
Example #11
def runOnlineLDA(data, numOfTags, K = 10):
  '''
  require preprocessed input data
  input format:
    (key, [values])
  All values would be equally weighted.
  '''
  corpus = data.map(lambda kv: list(kv[1])) \
               .filter(lambda p: len(p) >= 2) \
               .zipWithIndex().map(lambda pi: (pi[1] + 1, pi[0])) \
               .mapValues(lambda p: SparseVector(numOfTags, {val: 1.0 for val in p})) \
               .map(lambda iv: [iv[0], iv[1]]).cache()

  return LDA.train(corpus, K, optimizer='online')
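A small usage sketch for runOnlineLDA, assuming an active SparkContext and that the tag values are already integer indices smaller than numOfTags, as the SparseVector construction above requires; the data is invented:

from pyspark import SparkContext
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.clustering import LDA

sc = SparkContext(appName="online_lda_demo")

# (key, [tag indices]) pairs; each tag index must be < numOfTags
data = sc.parallelize([
    ("post-1", [0, 3, 5]),
    ("post-2", [1, 3]),
    ("post-3", [2, 4, 5, 1]),
])

model = runOnlineLDA(data, numOfTags=6, K=2)
print(model.describeTopics(maxTermsPerTopic=3))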
Example #12
def LDAThis(sc, RDD, minFreq, numTopics, maxIter, wordsPerTopic):
    '''
Arguments:
     sc: A SparkContext Object
     RDD: An RDD with rows as tokenized sentences
     minFreq: Minimum document frequency for CountVectorizer
     numTopics: Number of Topics
     maxIter: Max number of iterations for LDA train
     wordsPerTopic: Number of words to show per topic
     topWords: Number of words to show per topic
Requirements
     sqlContext = SQLContext(sc) <- must be defined outside function
     '''
    StopWords = stopwords.words("english")
    sqlContext = SQLContext(sc)
    # Structure Data
    idRDD = RDD.map(
        lambda words: [x for x in words if x.isalpha() and x not in StopWords
                       ]).filter(lambda x: len(x) > 2).zipWithIndex()
    idDF = sqlContext.createDataFrame(idRDD, ["tokens", 'index'])
    # Term Frequency
    CVecModel = CountVectorizer(inputCol="tokens",
                                outputCol="rawFeatures",
                                vocabSize=5000,
                                minDF=minFreq).fit(idDF)
    resultCVec = CVecModel.transform(idDF)
    vocabArray = CVecModel.vocabulary
    #IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(resultCVec)
    resultTFIDF = idfModel.transform(resultCVec)
    # LDA
    resultLDA = LDA.train(resultTFIDF.select(
        'index', 'features').rdd.mapValues(Vectors.fromML).map(list),
                          k=numTopics,
                          maxIterations=maxIter)
    topicIndices = sc.parallelize(
        resultLDA.describeTopics(maxTermsPerTopic=wordsPerTopic))
    topicsFinal = topicIndices.map(lambda topic: render_topics(
        topic, wordsPerTopic, vocabArray)).collect()

    # Show Topics
    for topic in range(len(topicsFinal)):
        print("Topic" + str(topic) + ":")
        for term in topicsFinal[topic]:
            print(term)
        print('\n')
    return resultLDA
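A hedged usage sketch for LDAThis, assuming the imports the function itself relies on (SQLContext, CountVectorizer, IDF, the mllib LDA and Vectors, nltk stopwords) plus the render_topics helper it calls but does not define; both the helper and the data below are assumptions:

from pyspark import SparkContext

def render_topics(topic, wordsPerTopic, vocabArray):
    # Assumed helper: map a topic's term indices back to vocabulary words
    terms = topic[0]
    return [vocabArray[terms[i]] for i in range(wordsPerTopic)]

sc = SparkContext(appName="lda_this_demo")
tokenized = sc.parallelize([
    ["spark", "makes", "distributed", "topic", "modeling", "simple"],
    ["latent", "dirichlet", "allocation", "finds", "topics", "in", "documents"],
    ["topic", "models", "group", "words", "that", "occur", "together"],
])
resultLDA = LDAThis(sc, tokenized, minFreq=1.0, numTopics=2, maxIter=20, wordsPerTopic=3)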
Example #13
def runLDA(filepath, n):
    data = sc.textFile(filepath)
    n_vcb = int(data.take(2)[1])

    parsedData = data.map(lambda line: line.strip().split(' ')).filter(
        lambda x: len(x) > 2).map(lambda x: (int(x[0]) - 1, (int(x[
            1]) - 1, float(x[2])))).groupByKey().mapValues(list)
    corpus = parsedData.map(
        lambda x: [x[0], Vectors.sparse(n_vcb, x[1])]).cache()

    ldaModel = LDA.train(corpus, k=n)

    print "Learned topics (as distributions over vocab of " + str(
        ldaModel.vocabSize()) + " words):"
    print ldaModel.describeTopics(maxTermsPerTopic=20)
    return ldaModel.topicsMatrix()
Example #14
def A1():  #1) apply LDA and find topics in user's posts (including reposts)
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  #Remove signs and split by spaces
    stopRemover = StopWordsRemover(
        inputCol="splitted",
        outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("russian") +
        StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")

    #Filter if post id exists?
    data = uWallP\
        .filter( uWallP.text != "" )\
        .select("id","text")\
        .limit(10)

    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])
    model = pipeline.fit(data)
    result = model.transform(data)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [r.id, Vectors.fromML(r.features)]).cache()

    # Cluster the documents into k topics using LDA
    ldaModel = LDA.train(corpus, k=8, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.stages[2].vocabulary  #CountVectorizer
    wordNumbers = 20  # number of words per topic
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

    def topic_render(topic):  # specify vector id of words to actual words
        terms = topic[0]
        result = []
        for i in range(wordNumbers):
            term = vocabArray[terms[i]]
            result.append(term)
        return result

    topics_final = topicIndices.map(
        lambda topic: topic_render(topic)).collect()

    for topic in range(len(topics_final)):
        print("Topic" + str(topic) + ":")
        for term in topics_final[topic]:
            print(term)
        print('\n')
Example #15
def word_topics(num_topics=NUM_TOPICS, num_words_per_topics=NUM_WORDS_PER_TOPICS):
    """Generates topics from word clusters.

    Arguments:
        num_topics {integer} -- Number of topics to infer
        num_words_per_topics {integer} -- Number of terms to collect for each topic

    Returns:
        None
    """
    spark = init_spark(AITA_CLEANED_COLLECTION)
    data_rdd = spark.read.format('mongo').load().rdd

    preprocessed_rdd = data_rdd\
        .flatMap(lambda row: [row['header'].lower().split(' ') + row['content'].lower().split(' ')]) \
        .zipWithIndex() \
        .map(lambda x: Row(index=x[1], words=x[0]))

    preprocessed_df = spark.createDataFrame(preprocessed_rdd)

    cv = CountVectorizer(inputCol='words', outputCol='vectors')
    model = cv.fit(preprocessed_df)
    vector_df = model.transform(preprocessed_df)

    corpus = vector_df.select('index', 'vectors').rdd.map(lambda x: [x[0], Vectors.fromML(x[1])]).cache()

    lda_model = LDA.train(corpus, k=num_topics, maxIterations=100, optimizer='online')
    vocab_array = model.vocabulary

    topic_indices = spark.sparkContext.parallelize(lda_model.describeTopics(maxTermsPerTopic=num_words_per_topics))

    def vector_id_to_word(topic):
        terms = topic[0]
        weights = topic[1]
        result = []
        for i in range(num_words_per_topics):
            result.append((vocab_array[terms[i]], weights[i]))
        return result

    topics = topic_indices.map(lambda topic: vector_id_to_word(topic)).collect()

    for i in range(len(topics)):
        print('Topic {}:'.format(i))
        for item in topics[i]:
            print(item)
        print('\n')
Example #16
    def train():
        data = sc.textFile(corpus_filename).map(lambda line: Vectors.dense(
            [float(i) for i in line.strip().split()]))
        corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
        # print(corpus.take(5))

        lda_model = LDA.train(rdd=corpus,
                              maxIterations=max_iter,
                              seed=seed,
                              checkpointInterval=checkin_point_interval,
                              k=K,
                              optimizer=optimizer,
                              docConcentration=alpha,
                              topicConcentration=beta)
        if os.path.exists('./ldamodel'):
            __import__('shutil').rmtree('./ldamodel')
        lda_model.save(sc, "./ldamodel")
Example #17
def main():
	p = sys.argv[1]
	logFile = "data/" + p + "_cleaned.txt"
	sc = SparkContext("local", "simpleApp")
	data = sc.textFile(logFile).cache()
	numberoftweets = data.count()
	words = data.flatMap(lambda x: x.split(" ")).distinct()
	word_list = words.collect()
	words = words.flatMap(lambda x:d(x,numberoftweets))
	data = data.zipWithIndex().map(lambda (x,y): (y,x.split(" ")))
	wc = data.flatMap(lambda x: func(x)).groupByKey().mapValues(lambda x: len(x))
	mat = words.leftOuterJoin(wc).map(lambda (x,y):  (x[0],(x[1], f(y[0],y[1])))).groupByKey().sortByKey().mapValues(lambda x:list(x)).mapValues(lambda x: ok(x))
	parsedData = mat.mapValues(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')])).map(lambda (x,y): [x,y])

	# Index documents with unique IDs
	corpus = parsedData.cache()

	# Cluster the documents into three topics using LDA
	ldaModel = LDA.train(corpus, k=3)

	# Output topics. Each is a distribution over words (matching word count vectors)
	topics = ldaModel.topicsMatrix()
	topics_dict={}
	for topic in range(3):
		k = "Topic "+ str(topic)
		topics_dict[k] = {}
		for word in range(0, ldaModel.vocabSize()):
			topics_dict[k][str(topics[word][topic])] = word_list[word]

	# path = "data/" + p + "_results.txt"
	# json = open(path, 'wb')

	# from chardet import detect
	# encoding = lambda x: detect(x)['encoding']

	for i in topics_dict.keys():
		counter=0
		z = sorted(topics_dict[i],reverse=True)
		for l in z:
			if counter == 7: break
			line = topics_dict[i][l] + " "
			counter+=1
			string_for_output = line.encode('utf8', 'replace')
			print(line)
Example #18
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)

    # Load the data
    data = sc.textFile(hdfs_path)

    # Compute term frequencies
    documents = data.map(tokenize)
    hashingTF = HashingTF(2 << 10)
    tf = hashingTF.transform(documents)

    # Index the document term-frequency vectors
    corpus = tf.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

    # Mapping from hashed indices to terms
    mapping = hashing_term_mapping(documents)
    mapping.cache()

    # Train the LDA model
    ldaModel = LDA.train(corpus, k=3)

    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user,
                                    mongo_pass,
                                    mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)

    # Save the results to MongoDB
    topics = ldaModel.describeTopics(maxTermsPerTopic=10)
    for topic in range(3):
        doc = {}
        doc['name'] = "topic " + str(topic)
        doc['terms'] = []
        for i in range(10):
            term_index = topics[topic][0][i]
            for term in mapping.lookup(term_index):
                doc['terms'].append([term.encode("utf8"), topics[topic][1][i]])
        send_mongodb(mongo_client, doc)
Example #19
def getCellLDA(Documents, NumberOfTopics):
	#print("\n\n\n/////////////////////////////////////////////////////////////////////////////////////")
	#print(Documents.collect())""""""
	corpus = sc.parallelize(Documents)
	term_counts = corpus.flatMap(lambda x: x).map(lambda x: (x,1)).reduceByKey(add)
	#print(term_counts.collect())
	vocabulary = term_counts.map(lambda x: x[0]).zipWithIndex().collectAsMap()
	#print("_____________________________________________________________________________________")
	#print(vocabulary)
	#print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
	
	
	documents = corpus.zipWithIndex().map(lambda doc: documentToSparseVector(doc,vocabulary)).map(list)
	#print(documents.collect())
	#print("*************************************************************************************")
	
	lda = LDA.train(documents, k=NumberOfTopics, seed=1)
	#print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
	#print(result)
	#print("|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||")
	return lda, vocabulary
Example #20
def LDA_Treatment(str):
    finalTopics = []
    txt = wordTokenize(str)
    data = sc.parallelize([txt]).zipWithIndex().map(lambda val: Row(idd=val[1], _words=val[0].split(" ")))
    docDF = spark.createDataFrame(data)
    Vector = CountVectorizer(inputCol="_words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)
    corpus = result.select("idd", "vectors").rdd.map(lambda val: [val[0], Vectors.fromML(val[1])]).cache()
    ldaModel = LDA.train(corpus, k=nbTopics, maxIterations=1000, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.vocabulary
    topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
    topics_final = topicIndices.map(lambda topic: topic_render(topic, vocabArray)).collect()

    for topic in range(len(topics_final)):
        for term in topics_final[topic]:
            term = unidecode.unidecode(term)
            finalTopics.append(term)

    return finalTopics
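LDA_Treatment above leans on several module-level names that are not part of the snippet: nbTopics, wordNumbers, a wordTokenize helper, and a topic_render(topic, vocabArray) helper. A hedged sketch of what those assumed pieces might look like:

nbTopics = 5       # assumed number of topics
wordNumbers = 10   # assumed number of words kept per topic

def wordTokenize(text):
    # Assumed: normalize a raw string into a space-separated token string
    return " ".join(text.lower().split())

def topic_render(topic, vocabArray):
    # Assumed: map a topic's term indices back to vocabulary words
    terms = topic[0]
    return [vocabArray[t] for t in terms]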
Example #21
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)

    # Load the data
    data = sc.textFile(hdfs_path)

    # Compute term frequencies
    documents = data.map(tokenize)
    hashingTF = HashingTF(2 << 10)
    tf = hashingTF.transform(documents)

    # Index the document term-frequency vectors
    corpus = tf.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

    # Mapping from hashed indices to terms
    mapping = hashing_term_mapping(documents)
    mapping.cache()

    # Train the LDA model
    ldaModel = LDA.train(corpus, k=3)

    # Connect to MongoDB
    from pymongo import MongoClient

    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism="SCRAM-SHA-1")
    clear_mongodb(mongo_client)

    # Save the results to MongoDB
    topics = ldaModel.describeTopics(maxTermsPerTopic=10)
    for topic in range(3):
        doc = {}
        doc["name"] = "topic " + str(topic)
        doc["terms"] = []
        for i in range(10):
            term_index = topics[topic][0][i]
            for term in mapping.lookup(term_index):
                doc["terms"].append([term.encode("utf8"), topics[topic][1][i]])
        send_mongodb(mongo_client, doc)
Example #22
def main():
	p = sys.argv[1]
	logFile = "data/" + p + "_cleaned.txt"
	sc = SparkContext("local", "simpleApp")
	sqlContext = SQLContext(sc)
	data = sc.textFile(logFile).zipWithIndex().map(lambda (words,idd): Row(idd= idd, words = words.split(" "))).cache()
	docDF = sqlContext.createDataFrame(data)
	Vector = CountVectorizer(inputCol="words", outputCol="vectors")
	model = Vector.fit(docDF)
	result = model.transform(docDF)
	corpus_size = result.count()

	corpus = result.select("idd", "vectors").map(lambda (x,y): [x,y]).cache()

	# Cluster the documents into three topics using LDA
	ldaModel = LDA.train(corpus, k=3,maxIterations=100,optimizer='online')
	topics = ldaModel.topicsMatrix()
	wordNumbers = 10
	topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))
	vocabArray = model.vocabulary
	topics_final = topicIndices.map(lambda topic: topic_render(topic,wordNumbers,vocabArray)).collect()

	path = "data/" + p + "_results.txt"
	json = open(path, 'wb')
	json.close()

	for topic in topics_final:
		for term in topic:
			line = term[0] + " "

			try:
				string_for_output = line.encode('utf8', 'replace')
				if string_for_output != " ":
					os.system("python3 basic/codes/p3p.py " +  string_for_output + "  >> " + path)
			except: pass

		os.system("python3 basic/codes/p3p.py " +  "delmch" + "  >> " + path)
Example #23
def getCellLDA(Documents, NumberOfTopics, OnlineOptimizer):
    #print("\n\n\n/////////////////////////////////////////////////////////////////////////////////////")
    #print("F**K OFF")
    #print("docs:")
    #print(Documents.collect())
    corpus = Documents  #.flatMap(lambda x: x[1])#sc.parallelize(Documents)
    #print("corpus:")
    #print(corpus.collect())
    term_counts = corpus.flatMap(lambda x: x).map(
        lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    #term_counts.cache()
    #print("term_counts:")
    #print(term_counts)
    #print(term_counts.collect())
    #print(term_counts.collect())
    #print("vocabulary:")
    vocabulary = term_counts.map(lambda x: x[0]).zipWithIndex().collectAsMap()
    #print(vocabulary)
    #print("______________________________________________________________________________________\n\n")
    #print("_____________________________________________________________________________________")
    #print(vocabulary)
    #print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")

    documents = corpus.zipWithIndex().map(
        lambda doc: documentToSparseVector(doc, vocabulary)).map(list)
    #print(documents.collect())
    #print("*************************************************************************************")

    Optimizer = "online" if OnlineOptimizer else "em"
    lda = LDA.train(documents,
                    k=NumberOfTopics,
                    maxIterations=20,
                    optimizer=Optimizer)
    #print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    #print(result)
    #print("|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||")
    return lda, vocabulary  #[lda.describeTopics(), vocabulary]
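Both getCellLDA variants rely on a documentToSparseVector helper that is not shown. A hedged sketch of what it presumably does: turn a (token list, index) pair produced by zipWithIndex, together with the vocabulary map, into an (index, SparseVector) pair of term counts:

from pyspark.mllib.linalg import SparseVector

def documentToSparseVector(doc, vocabulary):
    tokens, index = doc  # doc comes from corpus.zipWithIndex()
    counts = {}
    for token in tokens:
        term_id = vocabulary[token]
        counts[term_id] = counts.get(term_id, 0.0) + 1.0
    return (index, SparseVector(len(vocabulary), counts))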
Example #24
from pyspark.sql import SQLContext, Row
sc = SparkContext()
# input file is a term-document matrix, which is generated by make_tdm.py
data = sc.textFile(
    "/Users/Zhen/Desktop/Courses/BigData/stackexchange/topicModeling/result/matrix.csv"
)
header = data.first()  #extract header
data = data.filter(lambda x: x != header)
data = data.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(',')]))

# Index documents with unique IDs
corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into k topics using LDA
ldaModel = LDA.train(corpus, k=30)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
# for topic in range(3):
#     print("Topic " + str(topic) + ":")
#     for word in range(0, ldaModel.vocabSize()):
#         print(" " + str(topics[word]))

import numpy
numpy.savetxt(
    "/Users/Zhen/Desktop/Courses/BigData/stackexchange/topicModeling/result/lda_topicMatrix.csv",
    topics,
    delimiter=",")
Example #25
            score_topic = []
            for i in range(10):
                fo.write("Run : " + str(i) + "\n")
                ''''shuffle(l)
                    x=sc.parallelize(l).map(lambda d:(d,0))
                    corpus = corpus.map(lambda e: (e[0],e[1]))
                    print(x.collect())
                    corpus=corpus.join(x).map(lambda e: [e[0],e[1][0]] )'''

                x = corpus.collect()
                shuffle(x)
                corpus = sc.parallelize(x)
                ldaModel = LDA.train(corpus,
                                     k=10,
                                     maxIterations=20,
                                     docConcentration=-1.0,
                                     topicConcentration=-1.0,
                                     checkpointInterval=10,
                                     optimizer='online')
                topicIndices = ldaModel.describeTopics(maxTermsPerTopic=10)
                topics = []
                for x in topicIndices:
                    topics.append(
                        zip(list(map(lambda a: str(vocabArray[int(a)]), x[0])),
                            x[1]))
                for a in range(len(topics)):
                    fo.write("Topic " + str(a) + ": ")
                    str1 = ''
                    for b in topics[a]:
                        str1 += b[0] + " "
                        fo.write(b[0] + " ")
# Load and parse the data
#data = to_list[0:1000]
from pyspark.sql.types import StringType
from pyspark.sql.functions import *
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors as MLlibVectors

corpus_df = spark.createDataFrame(data, StringType())
corpus_df = corpus_df.withColumn("index",monotonically_increasing_id())
corpus_df = corpus_df.withColumn("arrayColumn", array("value"))
#data = sc.textFile(path).zipWithIndex().map(lambda (words,idd): Row(idd= idd, words = words.split(" ")))
#docDF = spark.createDataFrame(data)
Vector = CountVectorizer(inputCol="arrayColumn", outputCol="vectors")
model = Vector.fit(corpus_df)
result = model.transform(corpus_df)

# Cluster the documents into ten topics using LDA (the model needs the vectorized corpus, not the raw strings)
ldaModel = LDA.train(
    result.select("index", "vectors").rdd.map(lambda r: [r[0], MLlibVectors.fromML(r[1])]),
    k=10)

#ldaModel.save(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")
#sameModel = LDAModel\
#    .load(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")

# COMMAND ----------

num_topics = 10
max_iterations = 100

lda_model = LDA.train(
    result.select("index", "vectors").rdd.map(lambda r: [r[0], MLlibVectors.fromML(r[1])]),
    k=num_topics, maxIterations=max_iterations)

# COMMAND ----------

result.show(truncate=False)
Example #27
            counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (id, Vectors.sparse(len(vocabulary), keys, values))


# Process all of the documents into word vectors using the
# `document_vector` function defined previously
documents = tokens.zipWithIndex().map(document_vector).map(list)

# Get an inverted vocabulary, so we can look up the word by its index value
inv_voc = {value: key for (key, value) in vocabulary.items()}

# Open an output file
with open("new_output.txt", 'w') as f:
    lda_model = LDA.train(documents,
                          k=num_topics,
                          maxIterations=max_iterations)
    topic_indices = lda_model.describeTopics(
        maxTermsPerTopic=num_words_per_topic)

    # Print topics, showing the top-weighted 10 terms for each topic
    for i in range(len(topic_indices)):
        f.write("Topic #{0}\n".format(i + 1))
        for j in range(len(topic_indices[i][0])):
            f.write("{0}\t{1}\n".format(inv_voc[topic_indices[i][0][j]] \
                                        .encode('utf-8'), topic_indices[i][1][j]))

    f.write("{0} topics distributed over {1} documents and {2} unique words\n" \
            .format(num_topics, documents.count(), len(vocabulary)))
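The snippet above starts partway through its document_vector helper and assumes that tokens, vocabulary, and the topic settings were defined earlier. A hedged reconstruction of those missing pieces; every name and value here is an assumption, not the original code:

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA

num_topics = 10
max_iterations = 50
num_words_per_topic = 10

# tokens: RDD of token lists (one per document); vocabulary: dict mapping token -> index
def document_vector(doc_with_id):
    tokens, id = doc_with_id
    counts = {}
    for token in tokens:
        if token not in vocabulary:
            continue
        token_id = vocabulary[token]
        if token_id not in counts:
            counts[token_id] = 0
        counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (id, Vectors.sparse(len(vocabulary), keys, values))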
Example #28
for file_id in range(1,kFileNum):
    YOUR_FILE = "wet_data/CC-MAIN-20150728002301-%05d-ip-10-236-191-2.ec2.internal.warc.wet"%file_id
    YOUR_DELIMITER = "WARC/1.0"
    text_file= sc.newAPIHadoopFile(YOUR_FILE,"org.apache.hadoop.mapreduce.lib.input.TextInputFormat", "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text", conf = {"textinputformat.record.delimiter":YOUR_DELIMITER}).map(lambda l:l[1])
    
    file_words = text_file.map(lambda file:file.replace('\n',' '))
    current_corpus = file_words.map(gen_vectors)	
    if total_corpus is None:
        total_corpus = current_corpus
    else:
        total_corpus = total_corpus.union(current_corpus)
total_corpus = total_corpus.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

kNumTopics = 10
# Cluster the documents into kNumTopics topics using LDA
ldaModel = LDA.train(total_corpus, k=kNumTopics)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()

topic_words = []

for topic in range(kNumTopics):
    word_weight = []
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        #print(" " + str(topics[word][topic]))
        word_weight.append((word,topics[word][topic]))
    sorted_word_weight = sorted(word_weight,key = lambda x:-x[1])
    print(sorted_word_weight)
Example #29
#Create Tweet ID
from pyspark.sql.functions import monotonically_increasing_id
df = df.withColumn("tweet_id", monotonically_increasing_id())
df.show()

#Count Vectorizer
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol='ngrams',
                     outputCol='features',
                     vocabSize=100000,
                     minDF=2)
cvmodel = cv.fit(df)
result = cvmodel.transform(df)
result.show()

from pyspark.mllib.linalg import Vectors as MLlibVectors
from pyspark.mllib.clustering import LDA as MLlibLDA

#Train the LDA model
model = MLlibLDA.train(result.select("tweet_id", "features").rdd.mapValues(
    MLlibVectors.fromML).map(list),
                       k=3)
#Show Topics and weights
topics = model.describeTopics(maxTermsPerTopic=50)
for x, topic in enumerate(topics):
    print('topic number: ' + str(x))
    words = topic[0]
    weights = topic[1]
    for n in range(len(words)):
        print(cvmodel.vocabulary[words[n]] + ' ' + str(weights[n]))
Example #30
    # for tweet in tweets:
    #     fd.write(tweet+'\n')

    rdd = sc.textFile('opinion.txt').zipWithIndex().map(
        lambda wi: Row(idd=wi[1], words=wi[0].split(" ")))
    docDF = spark.createDataFrame(rdd)
    Vector = CountVectorizer(inputCol="words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)

    corpus = result.select(
        "idd", "vectors").rdd.map(lambda r: [r[0], Vectors.fromML(r[1])]).cache()

    # Cluster the documents into five topics using LDA
    ldaModel = LDA.train(corpus, k=5, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.vocabulary

    wordNumbers = 5  # number of words per topic
    topicIndices = sc.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

    def topic_render(topic):  # specify vector id of words to actual words
        terms = topic[0]
        result = []
        for i in range(wordNumbers):
            term = vocabArray[terms[i]]
            result.append(term)
        return result
Example #31
sqlContext = SQLContext(sc)
path = ... # path of the txt file

data = sc.textFile(path).zipWithIndex().map(lambda (words,idd): Row(idd= idd, words = words.split(" ")))
docDF = sqlContext.createDataFrame(data)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus_size = result.count()  # total number of documents
corpus = result.select("idd", "vectors").map(lambda (x,y): [x,y]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3,maxIterations=100,optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 10  # number of words per topic
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))

def topic_render(topic):  # specify vector id of words to actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result
    
topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()
Example #32
df_comments = sqlContext.createDataFrame(comments, ["list_of_words", 'index'])

# TF
cv = CountVectorizer(inputCol="list_of_words",
                     outputCol="raw_features",
                     vocabSize=50000,
                     minDF=10.0)
cvmodel = cv.fit(df_comments)
result_cv = cvmodel.transform(df_comments)
# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

lda = LDA(k=3, maxIter=50)
model = lda.fit(result_tfidf[['index', 'features']])

transformed = model.transform(result_tfidf)
# transformed.show(truncate=False)
model.describeTopics(8).show()

# ll = model.logLikelihood(result_tfidf[['index','features']])
# lp = model.logPerplexity(result_tfidf[['index','features']])

vocabulary = {}
j = 0
for i in cvmodel.vocabulary:
    vocabulary[j] = i.encode("utf-8")
    j += 1
Example #33
# TF
cv = CountVectorizer(inputCol="list_of_words",
                     outputCol="raw_features",
                     vocabSize=5000,
                     minDF=10.0)
cvmodel = cv.fit(df_txts)
result_cv = cvmodel.transform(df_txts)
# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

num_topics = 10
max_iterations = 100
lda_model = LDA.train(result_tfidf[['index', 'features']].rdd.map(list),
                      k=num_topics,
                      maxIterations=max_iterations)

wordNumbers = 5
topicIndices = spark.sparkContext.parallelize(
    lda_model.describeTopics(maxTermsPerTopic=wordNumbers))


def topic_render(topic):
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result
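The vocabArray that topic_render indexes into is not defined in this fragment; presumably it is the fitted CountVectorizer vocabulary, e.g.:

vocabArray = cvmodel.vocabulary  # assumed: index -> word lookup used by topic_render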
Example #34
    hashed_word = pd.DataFrame(hashed.collect(), columns=['hash','word']).set_index('hash')
    # hashingTF = HashingTF()
    # Build the TF-IDF representation
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tf_idf_data = idf.transform(tf)
    print(dt.now().strftime('%Y/%m/%d %H:%M:%S'))
    K = 5


    # Index documents with unique IDs
    corpus_data = tf_idf_data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    print(corpus_data)
    # Cluster the documents into K topics using LDA
    ldaModel = LDA.train(corpus_data, k=K)

    # Output topics. Each is a distribution over words (matching word count vectors)
    print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
    topics = ldaModel.topicsMatrix()
    print(dt.now().strftime('%Y/%m/%d %H:%M:%S'))
    def idx_to_word(idx):
        res = hashed_word.ix[idx].word
        if type(res) == pd.Series:
            return res.to_dict().values()[0]
        else:
            return res
    rep_num = 20

    for topic in range(K):
        print "Topic " + str(topic) + ":" 
Example #35
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

try:
    # create SparkContext on all CPUs available: in my case I have 4 CPUs on my laptop
    sc = ps.SparkContext('local[4]')
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")
# Load and parse the data
data = sc.textFile("spark_sample_data/sample_lda_data.txt")

parsedData = data.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
print(corpus.take(10))

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, optimizer='online')

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))
def toSparseVector(corpusLine, nFeatures):
    v = {idx: val for idx, val in corpusLine}
    return Vectors.sparse(nFeatures, v)

# nSamples = len(corpus)
nFeatures = len(dic)

corpusParallel = sc.parallelize(corpus)
corpusMapped = corpusParallel.map(lambda doc: toSparseVector(doc, nFeatures))

# Index documents with unique IDs
corpusIndexed = corpusMapped.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

nTopics = 10
ldaModel = LDA.train(corpusIndexed, k=nTopics)

# Dirty trick -- use sklearn LDA to do the transform step
# This should be possible on Spark, but can't figure out how
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_topics=nTopics, max_iter=1,
                                learning_method='online', learning_offset=50.
                                )
doc0 = corpusIndexed.first()[1].toArray()
lda.fit(doc0)
lda.components_ = ldaModel.topicsMatrix().T


def getDocumentTopics(docTokens, lda):
    wcTuples = dic.doc2bow(docTokens)
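As a hedged alternative to the sklearn workaround above, the DataFrame-based pyspark.ml.clustering.LDA exposes per-document topic distributions directly: its model's transform adds a topicDistribution column. A minimal sketch with invented data:

from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.appName("lda_transform_demo").getOrCreate()
docs = spark.createDataFrame([
    (0, ["spark", "topic", "model", "spark"]),
    (1, ["topic", "distribution", "per", "document"]),
], ["id", "tokens"])

cv_model = CountVectorizer(inputCol="tokens", outputCol="features").fit(docs)
vectorized = cv_model.transform(docs)

lda_model = LDA(k=2, maxIter=10).fit(vectorized)
# Each row gains a 'topicDistribution' vector: the inferred topic mixture for that document
lda_model.transform(vectorized).select("id", "topicDistribution").show(truncate=False)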
Example #37
    # First, we're going to identify the top words in the corpus and only keep track of those words.
    # Those top words will form our vocabulary.
    word_counts = sentences.flatMap(lambda s: s.split(" ")).map(
        lambda w: (w.lower(), 1)).reduceByKey(lambda a, b: a + b)
    top_words = word_counts.takeOrdered(500, key=lambda wc: -wc[1])
    vocabulary = [str(k) for (k, v) in top_words]

    # We also want a Broadcast version of the vocabulary list.
    br_vocabulary = sc.broadcast(vocabulary)

    # Next, we need to convert the raw text sentences into a dense-vector representation.
    dense_vectors = sentences.map(lambda s: vectorizer(s, br_vocabulary))

    # Finally, we create our corpus by giving each sentence an ID.
    corpus = dense_vectors.zipWithIndex().map(lambda vi: [vi[1], vi[0]])

    # Now we can train an LDA model on our data.
    lda_model = LDA.train(corpus, k=3, maxIterations=20)

    # Output topics. For each topic, print out the top words contributing to that topic.
    print("Learned topics (as distributions over vocab of " +
          str(lda_model.vocabSize()) + " words):")
    topics = lda_model.topicsMatrix()
    for topic in range(topics.shape[1]):
        print("Topic " + str(topic) + ":")
        topic_word_counts = sorted(zip(vocabulary,
                                       lda_model.topicsMatrix()[:, topic]),
                                   key=lambda wc: -wc[1])
        top_words = [w for (w, c) in topic_word_counts[:30]]
        print(top_words)
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="LatentDirichletAllocationExample")  # SparkContext

    # $example on$
    # Load and parse the items
    data = sc.textFile("/home/pipi/files/DATASETS/SparkMLlib/sample_lda_data.txt")
    parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
    # Index documents with unique IDs
    corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

    # Cluster the documents into three topics using LDA
    ldaModel = LDA.train(corpus, k=3)
    exit()

    # Output topics. Each is a distribution over words (matching word count vectors)
    print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())
          + " words):")
    topics = ldaModel.topicsMatrix()
    for topic in range(3):
        print("Topic " + str(topic) + ":")
        for word in range(0, ldaModel.vocabSize()):
            print(" " + str(topics[word][topic]))

    # Save and load model
    ldaModel.save(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")
    sameModel = LDAModel\
        .load(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")
Example #39
    'yyyy-MM-dd').alias('no_timestamp')).groupby('no_timestamp').count().sort(
        F.col('no_timestamp'))
print(dates.show(dates.count()))
dates.toPandas().plot(kind='line', x='no_timestamp', y='count')

dates.toPandas().plot(kind='bar', x='no_timestamp')

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
prep_df = tokenizer.transform(df)
cv_prep = CountVectorizer(inputCol="words", outputCol="prep")
cv_model = cv_prep.fit(prep_df)
ready_df = cv_model.transform(prep_df)
# stopWords = [word for word in cv_prep.vocabulary if any(char.isdigit() for char in word)]
# remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords = stopwords)
# prep_df = remover.transform(prep_df)

trainable = ready_df.select(
    'tweet_id', 'prep').rdd.map(lambda row: [row[0], Vectors.fromML(row[1])]).cache()
print("Trainable")
print(trainable.take(10))
print("take")
model = LDA.train(trainable, k=5, seed=1, optimizer="online")
exit(0)
#Print the topics in the model
topics = model.describeTopics(maxTermsPerTopic=15)
for x, topic in enumerate(topics):
    print('topic nr: ' + str(x))
    words = topic[0]
    weights = topic[1]
    for n in range(len(words)):
        print(cv_prep.vocabulary[words[n]] + ' ' + str(weights[n]))
id_index = (clean
    .map(lambda (id, tokens): id)
    .zipWithIndex()
    .map(lambda (id, zID): (zID, id))
)


#-------------------------------------------------------------------------------
# Snippet 5: model


# LDA Model
lda_model = LDA.train(
    rdd = tf_matrix,
    k = num_topics,
    maxIterations = 50,
    seed = 1300,
    optimizer = 'em'
)
topics_matrix = sc.broadcast(lda_model.topicsMatrix())


# Document Topics
doc_topics = (tf_matrix
    .map(lambda (zID, dv): (zID, dv.dot(topics_matrix.value)))
    .map(lambda (zID, res): (zID, res * (1 / np.sum(res))))
    .join(id_index)
    .map(lambda (zID, (res, doc_id)): (doc_id, list(res)))
)

# Topic Terms
Example #41
	sentences = tolstoy.filter(lambda s: len(s)>0)

	# We have a fair amount of data wrangling to do to get things into the right format for Spark's LDA. 

	# First, we're going to identify the top words in the corpus and only keep track of those words. 
	# Those top words will form our vocabulary.
	word_counts = sentences.flatMap(lambda s:  s.split(" ")).map(lambda w: (w.lower(),1)).reduceByKey(lambda a,b : a+b)
	top_words = word_counts.takeOrdered(500,key=lambda (w,c):-c)
	vocabulary = [str(k) for (k,v) in top_words]

	# We also want a Broadcast version of the vocabulary list.
	br_vocabulary = sc.broadcast(vocabulary)

	# Next, we need to convert the raw text sentences into a dense-vector representation.
	dense_vectors = sentences.map(lambda s: vectorizer(s,br_vocabulary))

	# Finally, we create our corpus by giving each sentence an ID.
	corpus = dense_vectors.zipWithIndex().map(lambda (v,i): [i, v]) 

	# Now we can train an LDA model on our data.
	lda_model = LDA.train(corpus, k =3, maxIterations=20)


	# Output topics. For each topic, print out the top words contributing to that topic.
	print("Learned topics (as distributions over vocab of " + str(lda_model.vocabSize()) + " words):")
	topics = lda_model.topicsMatrix()
	for topic in range(topics.shape[1]):
		print("Topic " + str(topic) + ":")    
		topic_word_counts = sorted(zip(vocabulary,lda_model.topicsMatrix()[:,topic]), key = lambda (w,c):-c )
		top_words = [w for (w,c) in topic_word_counts[:30]]
		print top_words
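Examples 37 and 41 both call a vectorizer(sentence, br_vocabulary) helper that is not shown; it presumably turns a sentence into a dense word-count vector over the broadcast vocabulary. A hedged sketch:

from pyspark.mllib.linalg import Vectors

def vectorizer(sentence, br_vocabulary):
    # Count occurrences of each vocabulary word in the sentence (dense bag-of-words)
    vocab = br_vocabulary.value
    index = {word: i for i, word in enumerate(vocab)}
    counts = [0.0] * len(vocab)
    for word in sentence.lower().split(" "):
        if word in index:
            counts[index[word]] += 1.0
    return Vectors.dense(counts)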
Example #42
                               columns=['hash', 'word']).set_index('hash')
    # hashingTF = HashingTF()
    # Build the TF-IDF representation
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tf_idf_data = idf.transform(tf)
    print(dt.now().strftime('%Y/%m/%d %H:%M:%S'))
    K = 5

    # Index documents with unique IDs
    corpus_data = tf_idf_data.zipWithIndex().map(
        lambda x: [x[1], x[0]]).cache()
    print(corpus_data)
    # Cluster the documents into K topics using LDA
    ldaModel = LDA.train(corpus_data, k=K)

    # Output topics. Each is a distribution over words (matching word count vectors)
    print "Learned topics (as distributions over vocab of " + str(
        ldaModel.vocabSize()) + " words):"
    topics = ldaModel.topicsMatrix()
    print(dt.now().strftime('%Y/%m/%d %H:%M:%S'))

    def idx_to_word(idx):
        res = hashed_word.ix[idx].word
        if type(res) == pd.Series:
            return res.to_dict().values()[0]
        else:
            return res

    rep_num = 20
Example #43
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA
import numpy as np
from numpy.testing import assert_almost_equal, assert_equal


sc = SparkContext('local[*]', appName='Word2Vec')
data = [
    [1, Vectors.dense([0.0, 1.0, 0.5])],
    [3, Vectors.dense([0.9, 1.2, 0.4])]]
rdd = sc.parallelize(data)
model = LDA.train(
    rdd,
    k=2,
    maxIterations=40,
    docConcentration=-1.0,
    topicConcentration=-1.0,
    seed=100,
    checkpointInterval=10,
    optimizer='em')
print(model.vocabSize())
print(model.topicsMatrix())

# topics = model.describeTopics(1)
topics = model.topicsMatrix()

for word in topics:
    print(word)

# topics_rdd = topics.rdd
# topics_words = topics_rdd\
Example #44
p = rescaledData.select('features')
p = p.limit(650000)  # you can choose number or comments you want to run LDA on
#p.count()
#p.show(3)

import threading
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO,
                    filename='running.log',
                    filemode='w')

#Calculating LDA:
start = time()
lda = LDA(k=20, maxIter=500)
model = lda.fit(p)
print('used LDA: {:.2f}s'.format(time() - start))

#model.isDistributed()

#start = time()
#ll = model.logLikelihood(p)
#lp = model.logPerplexity(p)
#print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
#print("The upper bound on perplexity: " + str(lp))
#print ('used: {:.2f}s'.format(time()-start))

start = time()
# Describe topics.
topics = model.describeTopics(15)