/
plotTextDetails.py
139 lines (109 loc) · 4.73 KB
/
plotTextDetails.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/python
import sys
import string
from nltk.util import tokenwrap
from nltk.probability import FreqDist
def word_filter(freq_dist, blacklist):
"""
Returns a dictionary of word-frequency pairs with stop words (such as "the" or "as") filtered out.
:param freq_dist: Frequency Distribution Object from nltk.
:type freq_dist: FreqDist
:param blacklist: List of words explicitly not to be shown. To be used if nltk provided list is missing some words.
Pass empty array if not needed.
:type blacklist: List
:rtype dict
"""
# Try to get list of stop words
try:
from nltk.corpus import stopwords
except ImportError:
raise ValueError(
'Could not import one of nltk.corpus or stopwords.'
)
stop_words = stopwords.words('english')
return dict((word, freq_dist[word]) for word in freq_dist if word.lower() not in stop_words and
word not in string.punctuation and word.isalnum() and word not in blacklist)
def plot_most_common(data, num=30, title='', y_label="Frequency", line_width=2):
# import nltk.text
# WordFreqDist = FreqDist()
# Try to import required libs
try:
from matplotlib import pylab
from operator import itemgetter
except ImportError:
raise ValueError(
'Error Importing MatPlotLib or operator.'
)
# Filter most frequent items, depending on value of "num" passed in. Default is 50.
reverse_most_frequent_items = sorted(data.items(), key=itemgetter(1))[-num:]
# The above returns highest frequency items but still lowest->highest. This flips to required order.
# Note: This second step is required, as if reversed=True is put in the first sort, it gives values in least.
most_frequent_items = sorted(reverse_most_frequent_items, key=itemgetter(1), reverse=True)
# Frequencies of Items
frequencies = [item[1] for item in most_frequent_items]
items = [item[0] for item in most_frequent_items]
# Customize Graph and Show
if title:
pylab.title(title)
pylab.plot(frequencies, linewidth=line_width)
pylab.ylabel(y_label)
pylab.grid(True, color="silver")
pylab.xticks(range(len(items)), items, rotation=90)
pylab.show()
def find_collocations(parsed_book, blacklist, num, window_size):
# Try to get list of stop words
try:
import nltk.text
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
except ImportError:
raise ValueError(
'Could not import one of the requirements.'
)
ignored_words = stopwords.words('english')
bi_gram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(parsed_book, window_size)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda word: len(word) < 3 or word.lower() in ignored_words or word.lower() in blacklist)
collocated_strings = finder.nbest(bi_gram_measures.likelihood_ratio, num)
return [first_word + ' ' + second_word for first_word, second_word in collocated_strings]
def main():
# Parse Book into Array
parsed_book = open(str(sys.argv[1])).read().split()
# Default Values and Parsing Input Values
# Graph Values
num_points = 30
if "--numPoints" in sys.argv:
num_points = sys.argv[sys.argv.index("--numPoints") + 1]
y_label = "Frequencies"
line_width = 3
title = "Top " + str(num_points) + " Useful Words For " + str(sys.argv[1])
if "--title" in sys.argv:
title = sys.argv[sys.argv.index("--title") + 1]
if "--yLabel" in sys.argv:
y_label = sys.argv[sys.argv.index("--yLabel") + 1]
if "--lineWidth" in sys.argv:
line_width = sys.argv[sys.argv.index("--lineWidth") + 1]
# Stop Words Values
blacklist = []
if "--blacklist" in sys.argv:
blacklist = sys.argv[sys.argv.index("--blacklist") + 1].replace(" ", "").split(',')
# Collocations Values
num_collocations = 20
if "--numCollocations" in sys.argv:
num_collocations = sys.argv[sys.argv.index("--numCollocations") + 1]
window_size = 4
if "--windowSize" in sys.argv:
window_size = sys.argv[sys.argv.index("--windowSize") + 1]
# Collocations
# Paper Explaining The Math
# https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
print("Words Commonly Used Together:")
print(tokenwrap(find_collocations(parsed_book, blacklist, num=int(num_collocations), window_size=int(window_size)),
separator=" ; "))
# Filter out Stop Words
filtered_freq_dist = word_filter(FreqDist(parsed_book), blacklist)
# Plot
plot_most_common(filtered_freq_dist, int(num_points), title, y_label, int(line_width))
if __name__ == "__main__":
main()