forked from almosteverywhere/fuzzy-nemesis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
common_words_pattern.py
executable file
·189 lines (149 loc) · 5.85 KB
/
common_words_pattern.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/python
# Keyword analysis of scraped rap-lyrics pages (Python 2, CLiPS "pattern" lib).
# Scores each artist's lyrics with pattern.vector keywords, filtering out
# filler words, artist names, and globally-common words.
import pattern.web
import re, sys, os
from operator import itemgetter
from pattern.web import plaintext, DOM
import pattern.vector as vec
# Filler/ad-lib tokens that show up in lyrics but carry no signal.
rap_exclude_words = ['1','2', '3', 'x2', 'ay', 'hey', 'uhh', '\'mon', 'cuz', 'c', '\'mma', 'kanye', 'west', 'yo', 't', 'oh', 'uh', 'ya', 'yea', 'la', 'gon', 'cause', 'em', 'yeah', '50', 'cent']
# ok, just write a thing that opens the directory with all the artists and prints out the directory
# names
# NOTE(review): hardcoded absolute paths — only work on the author's machine.
basedir = "/Users/julielavoie/projects/pycon/files/"
common_words_file = "/Users/julielavoie/projects/pycon/results/all"
results_basedir = "/Users/julielavoie/projects/pycon/results/"
stop_names_file = "/Users/julielavoie/projects/pycon/stop_names"
def get_common_words(file):
    """Read a whitespace-delimited results file and return its word column.

    Each line is expected to look like "<score><whitespace><word>"; the
    second whitespace-separated field of every line is collected.

    :param file: path to a results file (as written by
                 get_all_artists_all_words_to_file)
    :return: list of words, in file order
    """
    common = []
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(file) as f:
        for line in f:
            fields = re.split(r"\s", line)
            # skip blank/malformed lines instead of raising IndexError
            if len(fields) > 1:
                common.append(fields[1].strip())
    return common
# put all the rapper names into a file
def make_exclude_names():
    """Write every artist-name token to stop_names_file, one per line.

    Directory names under basedir encode artist names with '-' separators
    (e.g. "jay-z"), so each '-'-separated token becomes a stop name.
    Overwrites stop_names_file.
    """
    # 'with' closes the file even if os.listdir or a write raises
    # (the original leaked the handle on any exception before f.close()).
    with open(stop_names_file, 'w') as f:
        for artist_dir in os.listdir(basedir):
            # dir names are the name of the artist, separated by -, like jay-z
            for token in re.split('-', artist_dir):
                f.write(token)
                f.write("\n")
                print(token)
def load_exclude_names():
    """Append every name in stop_names_file to the global exclude list.

    Mutates the module-level rap_exclude_words in place; repeated calls
    append duplicates.
    """
    global rap_exclude_words
    # 'with' closes the file (the original never closed it).
    with open(stop_names_file) as f:
        for line in f:
            rap_exclude_words.append(line.strip())
def get_artist_docs(name):
    """Concatenate the lyrics plaintext of every saved page for one artist.

    :param name: artist directory name under basedir (e.g. "jay-z")
    :return: one string with the plaintext of each page's ".lyrics" div;
             pages without a .lyrics div are skipped
    """
    artist_dir = basedir + name
    rap_docs = ""
    for fname in os.listdir(artist_dir):
        # Build an absolute path instead of os.chdir()-ing on every
        # iteration: the original permanently changed the process CWD
        # as a side effect. 'with' also closes each file (original leaked).
        with open(os.path.join(artist_dir, fname), 'r') as fi:
            page = fi.read()
        # parse the scraped HTML page
        dom = DOM(page)
        # the lyrics live in the .lyrics div on these scraped pages
        if dom and dom('.lyrics'):
            lyrics = dom('.lyrics')[0]
        else:
            continue
        rap_docs += plaintext(lyrics.content)
    return rap_docs
# . questions we have, most common 20 words over ALL artists
# . what are words that appear only for that artist? like mad for biggie or flavor for queen latifah
# . what are words that appear only for eminem or biggie small?
# import pdb; pdb.set_trace()
# Kanye_corpus = vec.M(rap_docs, weight=vec.TFIDF)
# pass in a dict of bad words, return a dict of word counts
# pass in a dict of bad words, return a dict of word counts
def count_one_artist(name, bad_words):
    """Count how often each word in bad_words occurs in one artist's lyrics.

    :param name: artist directory name under basedir
    :param bad_words: iterable of words to look up
    :return: dict mapping each bad word found to its count, plus a
             'num_songs' entry (file count in the artist's directory) so
             callers can average counts per song
    """
    # ok, this is a bad way to get number of songs for that artist, so we can
    # average out the words per song: one file == one song.
    artist_dir = basedir + name
    num_songs = len(os.listdir(artist_dir))
    counts = {}  # renamed from 'dict' — the original shadowed the builtin
    word_counts = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in word_counts:
            counts[w] = word_counts[w]
    counts['num_songs'] = num_songs  # this is cheap: overloads the result dict
    return counts
def get_one_artist_excluding_common(name):
    """Print the top-20 keywords for one artist, excluding common words.

    Extends the exclude list with the globally-common words (from
    common_words_file) and every artist-name token from stop_names_file,
    then scores the artist's lyrics with pattern.vector.

    NOTE(review): this mutates the global rap_exclude_words in place, so
    repeated calls keep appending duplicates — confirm that is acceptable.
    """
    global rap_exclude_words
    rap_exclude_words += get_common_words(common_words_file)
    # ok this is the worst way to do this: pull artist-name stop words in too
    load_exclude_names()
    corpus = vec.Document(get_artist_docs(name),
                          exclude=rap_exclude_words, stop_words=False)
    print("for artist: " + name)
    for ln in corpus.keywords(top=20):
        print("%0.08f\t%s" % ln)
def get_each_artist_excluding_common():
    """Print and save the top-20 keywords for every artist directory.

    For each artist under basedir (skipping .git), the artist's own name
    tokens are pushed onto the exclude list, the lyrics are scored, and the
    results go both to stdout and to results_basedir/<artist>.

    NOTE(review): exclude words accumulate across iterations, so earlier
    artists' name tokens are also excluded for later artists — confirm
    that is intended.
    """
    global rap_exclude_words
    rap_exclude_words += get_common_words(common_words_file)
    for artist_dir in os.listdir(basedir):
        if artist_dir == '.git':
            continue
        # push the name of the artist onto the exclude words list
        for token in re.split('-', artist_dir):
            rap_exclude_words.append(token)
        docs = get_artist_docs(artist_dir)
        corpus = vec.Document(docs, exclude=rap_exclude_words,
                              stop_words=False)
        # 'with' closes the results file even if a write raises
        # (the original leaked the handle on exception).
        with open(results_basedir + artist_dir, 'w') as f:
            f.write("for artist: " + artist_dir)
            print("for artist: " + artist_dir)
            f.write("\n")
            for ln in corpus.keywords(top=20):
                # trailing \n added: the original ran all keywords
                # together on a single line in the results file
                f.write("%0.08f\t%s\n" % ln)
                print("%0.08f\t%s" % ln)
# print the results for the top 20 words to a file
def get_one_artist_all_words(name):
    """Print the top-20 keywords for one artist, excluding only the
    built-in filler words and the artist's own name tokens.

    :param name: artist directory name under basedir (e.g. "jay-z")
    """
    # push the name of the artist onto the exclude words list
    for token in re.split('-', name):
        rap_exclude_words.append(token)
        # space added after "excluding" — the original printed "excludingjay"
        print("excluding " + token)
    corpus = vec.Document(get_artist_docs(name),
                          exclude=rap_exclude_words, stop_words=False)
    for ln in corpus.keywords(top=20):
        print("%0.08f\t%s" % ln)
def get_all_artists_all_words():
    """Print the top-20 keywords over ALL artists' lyrics combined,
    excluding the built-in filler words plus every artist-name token."""
    docs = ""
    for artist_dir in os.listdir(basedir):
        # space added after "artist" — the original printed "artistjay-z"
        print("artist " + artist_dir)
        if artist_dir == '.git':
            continue
        # push the name of the artist onto the exclude words list
        for token in re.split('-', artist_dir):
            rap_exclude_words.append(token)
        docs += get_artist_docs(artist_dir)
    corpus = vec.Document(docs, exclude=rap_exclude_words, stop_words=False)
    for ln in corpus.keywords(top=20):
        print("%0.08f\t%s" % ln)
def get_all_artists_all_words_to_file():
    """Compute top-20 keywords over ALL artists combined and write them to
    common_words_file as one "score<TAB>word" pair per line, the format
    get_common_words() reads back.
    """
    docs = ""
    for artist_dir in os.listdir(basedir):
        # space added after "artist" — the original printed "artistjay-z"
        print("artist " + artist_dir)
        if artist_dir == '.git':
            continue
        # push the name of the artist onto the exclude words list
        for token in re.split('-', artist_dir):
            rap_exclude_words.append(token)
        docs += get_artist_docs(artist_dir)
    corpus = vec.Document(docs, exclude=rap_exclude_words, stop_words=False)
    # 'with' closes the file even on error (the original leaked the handle).
    with open(common_words_file, "w") as f:
        for ln in corpus.keywords(top=20):
            # trailing \n added: the original concatenated all 20 keywords
            # onto one line, which broke get_common_words' per-line parsing
            f.write("%0.08f\t%s\n" % ln)
            print("%0.08f\t%s" % ln)
if __name__ == '__main__':
if sys.argv[1:]:
name = sys.argv[1]