/
tagcloud_builder.py
83 lines (70 loc) · 2.32 KB
/
tagcloud_builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#! /usr/bin/env python
import sys
import nltk
import os
import fileinput
import datetime
from operator import itemgetter
from pytagcloud import create_tag_image, make_tags
class TagCloudBuilder:
    """Build a word-frequency tag cloud image from a directory of text files.

    Constructing an instance runs the whole pipeline: every regular file in
    ``txts/<state>/<city>`` is tokenized and counted, the words listed in
    ``ignore-words.txt`` are dropped, and the most frequent remaining words
    are rendered to a PNG which is then handed to ``webbrowser.open``.
    """

    # Class-level defaults. ``wordcount`` and ``img_directory`` act as
    # configuration; the others are overwritten per instance in __init__.
    city = ''
    state = ''
    wordcount = 100         # how many of the top words are passed to make_tags
    txt_directory = ''
    img_directory = 'imgs'  # directory part of the output image path

    def __init__(self, in_city, in_state):
        """Run the full pipeline for one city.

        in_city  -- city name; spaces are removed before it is used in paths.
        in_state -- state name; used unchanged in paths.
        """
        # One dict per instance. A class-level dict would be shared, so a
        # second builder would start with the first builder's counts.
        self.tagcloud = {}
        self.city = in_city.replace(' ', '')
        self.state = in_state
        self.txt_directory = 'txts/{0}/{1}'.format(self.state, self.city)
        files = self.get_file_list(self.txt_directory)
        self.parse(files)
        self.remove_ignored_words()
        # write_to_file() is intentionally not part of the default pipeline;
        # call it manually to dump the raw counts.
        self.build_pytag_cloud()

    # StackOverflow
    def get_file_list(self, directory):
        """Return '<directory>/<name>' for each regular file in directory.

        Sub-directories are skipped. The order is whatever os.listdir yields.
        """
        return [
            '{0}/{1}'.format(directory, name)
            for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        ]

    # http://www.mhermans.net/from-pdf-to-wordcloud-using-the-python-nltk.html
    def parse(self, files):
        """Tokenize each file and add its word counts to self.tagcloud.

        files -- iterable of paths to text files.

        Tokens are lower-cased, and only purely alphabetic tokens are counted.
        """
        for path in files:
            # Plain 'r' instead of 'rU': the 'U' flag no longer exists on
            # current Python 3, and the tokenizer does not depend on it.
            with open(path, 'r') as handle:
                txt = handle.read()
            tokens = nltk.word_tokenize(txt)  # tokenize text
            lowered = (word.lower() for word in tokens)
            clean_tokens = [word for word in lowered if word.isalpha()]  # drop all non-words
            # make frequency distribution of words
            fd = nltk.FreqDist(clean_tokens)
            for token in fd:
                # tokens are already lower-case at this point
                self.add_to_dictionary(token, fd[token])

    def add_to_dictionary(self, token, count):
        """Add count to the running total stored for token."""
        self.tagcloud[token] = self.tagcloud.get(token, 0) + count

    def remove_ignored_words(self, path='ignore-words.txt'):
        """Delete from self.tagcloud every word listed in the file at path.

        path -- text file with one word per line (default 'ignore-words.txt').
        """
        with open(path, 'r') as ignore_file:
            for line in ignore_file:
                # strip() rather than line[:-1]: slicing would cut the last
                # letter off a final line that has no trailing newline, and
                # would leave a '\r' behind on CRLF files.
                self.tagcloud.pop(line.strip(), None)

    def write_to_file(self, path='taglist.txt'):
        """Write str(self.tagcloud) to path, replacing any existing file."""
        # 'w' instead of the original 'wa', which Python 3 rejects as an
        # invalid mode.
        with open(path, 'w') as out:
            out.write(str(self.tagcloud))

    def build_pytag_cloud(self):
        """Render the top self.wordcount words to a PNG and open it.

        The image is written to
        '<img_directory>/<state>.<city>.<width>.<height>.png'.
        """
        width = 900
        height = 575
        file_name = '{0}/{1}.{2}.{3}.{4}.png'.format(
            self.img_directory, self.state, self.city, width, height)
        # items() works on both Python 2 and 3; iteritems() is Python 2 only.
        items = sorted(self.tagcloud.items(), key=itemgetter(1), reverse=True)
        tags = make_tags(items[:self.wordcount], maxsize=80)
        create_tag_image(tags, file_name, size=(width, height), fontname='Droid Sans')
        import webbrowser
        webbrowser.open(file_name)  # see results
if __name__ == '__main__':
    # TagCloudBuilder requires a city and a state; calling it with no
    # arguments raises TypeError, so take both from the command line.
    if len(sys.argv) != 3:
        sys.exit('usage: {0} CITY STATE'.format(sys.argv[0]))
    TagCloudBuilder(sys.argv[1], sys.argv[2])