from collections import defaultdict
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA, LDAModel
import re
import pandas as pd

num_of_stop_words = 50 # Number of most common words to remove, trying to eliminate stop words
num_topics = 2 # Number of topics we are looking for
num_words_per_topic = 10 # Number of words to display for each topic
max_iterations = 35 # Max number of times to iterate before finishing
# Initialize
sc = SparkContext('local', 'PySpark LDA Example')

# Process the corpus:
# 1. Load each tweet as an individual document
# 2. Strip any leading or trailing whitespace
# 3. Convert all characters into lowercase where applicable
# 4. Split each document into words on whitespace, semicolons, commas, and octothorpes (#)
# 5. Only keep the words made up entirely of alphabetical characters
# 6. Only keep words longer than 3 characters
# data = sc.wholeTextFiles('newsgroup/files/*').map(lambda x: x[1])
path_to_data = 'result.txt'
print('Reading data...')
all_tweets_df = pd.read_table(path_to_data, names=['ID', 'sentiment', 'tweet'])
data = list(all_tweets_df['tweet'])
data = sc.parallelize(data)
# with open('testdata.txt', 'r') as test:
#     data = test.readlines()
# data = sc.parallelize(data)

tokens = data \
    .map(lambda document: document.strip().lower()) \
    .map(lambda document: re.split(r"[\s;,#]", document)) \
    .map(lambda words: [x for x in words if x.isalpha()]) \
    .map(lambda words: [x for x in words if len(x) > 3])
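
# Illustration (a made-up tweet, not taken from result.txt): the pipeline above
# maps "Loving #Spark, big data tools!" to ['loving', 'spark', 'data'];
# 'big' fails the length filter and 'tools!' fails isalpha().
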
# Get our vocabulary
# 1. Flat map the tokens -> Put all the words in one giant list instead of a list per document
# 2. Map each word to a tuple containing the word, and the number 1, signifying a count of 1 for that word
# 3. Reduce the tuples by key, i.e.: Merge all the tuples together by the word, summing up the counts
# 4. Reverse the tuple so that the count is first...
# 5. ...which will allow us to sort by the word count
termCounts = tokens \
    .flatMap(lambda document: document) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda wc: (wc[1], wc[0])) \
    .sortByKey(ascending=False)
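
# termCounts is now an RDD of (count, word) pairs in descending order of count,
# e.g. [(1024, 'this'), (998, 'with'), ...] (the counts shown are made up).
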
# Identify a threshold to remove the top words, in an effort to remove stop words
threshold_value = termCounts.take(num_of_stop_words)[num_of_stop_words - 1][0]
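
# Optional sanity check (illustrative; not part of the original flow). Uncomment
# to see which presumed stop words the threshold will cut:
# for count, word in termCounts.take(num_of_stop_words):
#     print(count, word)
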
# Only keep words with a count less than the threshold identified above,
# and then index each one and collect them into a map
vocabulary = termCounts \
    .filter(lambda x: x[0] < threshold_value) \
    .map(lambda x: x[1]) \
    .zipWithIndex() \
    .collectAsMap()
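
# vocabulary is now a plain Python dict mapping word -> integer index,
# e.g. {'spark': 0, 'data': 1, ...} (actual indices depend on RDD ordering).
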
# Convert the given (tokens, index) pair into a vector of word counts
def document_vector(document):
    doc_id = document[1]
    counts = defaultdict(int)
    for token in document[0]:
        if token in vocabulary:
            token_id = vocabulary[token]
            counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (doc_id, Vectors.sparse(len(vocabulary), keys, values))
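
# Worked example (with a hypothetical two-word vocabulary {'spark': 0, 'data': 1}):
# document_vector((['spark', 'data', 'spark'], 7)) returns
# (7, SparseVector(2, {0: 2.0, 1: 1.0})).
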
# Process all of the documents into word vectors using the
# `document_vector` function defined previously
documents = tokens.zipWithIndex().map(document_vector).map(list)
# Get an inverted vocabulary, so we can look up a word by its index value
inv_voc = {value: key for (key, value) in vocabulary.items()}
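# e.g. with the hypothetical vocabulary above, inv_voc would be
# {0: 'spark', 1: 'data', ...}
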
# Open an output file
with open("new_output.txt", 'w', encoding='utf-8') as f:
    lda_model = LDA.train(documents, k=num_topics, maxIterations=max_iterations)
    topic_indices = lda_model.describeTopics(maxTermsPerTopic=num_words_per_topic)

    # Print topics, showing the top-weighted terms for each topic
    for i in range(len(topic_indices)):
        f.write("Topic #{0}\n".format(i + 1))
        for j in range(len(topic_indices[i][0])):
            f.write("{0}\t{1}\n".format(inv_voc[topic_indices[i][0][j]],
                                        topic_indices[i][1][j]))

    f.write("{0} topics distributed over {1} documents and {2} unique words\n"
            .format(num_topics, documents.count(), len(vocabulary)))
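
# Optional (a sketch; the path name is an example): persist the trained model so
# it can be reloaded later via the LDAModel class imported above. Uncomment to use:
# lda_model.save(sc, 'lda_model')
# same_model = LDAModel.load(sc, 'lda_model')

# Shut down the SparkContext cleanly
sc.stop()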