/
hosts_presenters_preprocessing.py
170 lines (147 loc) · 7.17 KB
/
hosts_presenters_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import json
import nltk
import operator
import sys
import re
from winners import find_winners
#amar shah
host_keywords = ["host", "hosting", "hosts", "hosted"]
presenters_keywords = ["presenting", "presenters", "Presenters", "Presenting"]
#accounts for other words as well
punctuation_stopword = [".", '"', ",", "?", "!", "/", "'", "-", "_", ";", ":", "&", ',"', '",', ")", "(", "Golden", "Globes", "@", "GoldenGlobes", "I", "we", "http", "://", "/", "co", "Hollywood", "Hooray"]
stopwords = nltk.corpus.stopwords.words('english') + punctuation_stopword
humor_keywords = ["hilarious", "funny", "comedian", "best joke", "hysterical"]
#winners = {'cecil b. demille award' : 'Jodie Foster', 'best motion picture - drama' : 'Argo', 'best performance by an actress in a motion picture - drama' : 'Jessica Chastain', 'best performance by an actor in a motion picture - drama' : 'Daniel Day-Lewis', 'best motion picture - comedy or musical' : 'Les Miserables', 'best performance by an actress in a motion picture - comedy or musical' : 'Jennifer Lawrence', 'best performance by an actor in a motion picture - comedy or musical' : 'Hugh Jackman', 'best animated feature film' : 'Brave', 'best foreign language film' : 'Amour', 'best performance by an actress in a supporting role in a motion picture' : 'Anne Hathaway', 'best performance by an actor in a supporting role in a motion picture' : 'Christoph Waltz', 'best director - motion picture' : 'Ben Affleck', 'best screenplay - motion picture' : 'Quentin Tarantino', 'best original score - motion picture' : 'Mychael Danna', 'best original song - motion picture' : 'Skyfall', 'best television series - drama' : 'Homeland', 'best performance by an actress in a television series - drama' : 'Claire Danes', 'best performance by an actor in a television series - drama' : 'Damian Lewis', 'best television series - comedy or musical' : 'Girls', 'best performance by an actress in a television series - comedy or musical':'Lena Dunham', 'best performance by an actor in a television series - comedy or musical':'Don Cheadle', 'best mini-series or motion picture made for television':'Game Change', 'best performance by an actress in a mini-series or motion picture made for television':'Julianne Moore', 'best performance by an actor in a mini-series or motion picture made for television':'Kevin Costner', 'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television': 'Maggie Smith', 'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television': 'Ed Harris'}
def loadParsedTweets(filename):
try:
with open(filename) as fl:
jsonObj = json.load(fl)
tweets = [tweet["text"] for tweet in jsonObj]
except (MemoryError, ValueError):
with open(filename) as fl:
tweets = [json.loads(line)["text"] for line in fl]
parsedTweets = [nltk.wordpunct_tokenize(tweet) for tweet in tweets]
for tweet in parsedTweets:
for token in tweet:
if token in stopwords:
tweet.remove(token)
return parsedTweets
def lookthroughTweets(keywords, year):
if str(year) == '2015':
filename = 'gg2015.json'
else:
filename = 'gg2013.json'
parsedTweets = loadParsedTweets(filename)
fTweets = [tweet for tweet in parsedTweets if any(x in tweet for x in keywords)]
#print fTweets
return fTweets
def lookthroughTweets2(presentersTweets,test1):
fTweets = [tweet for tweet in presentersTweets if any(x in tweet for x in test1)]
#print fTweets
return fTweets
def CreateWords(tweets,stopwords):
dictofwords = dict()
for tweet in tweets:
for word in tweet:
if word[0].isupper():
if word not in stopwords:
if word not in dictofwords:
dictofwords[word] = 0
dictofwords[word] += 1
#print dictofwords
return dictofwords
def CreateNames(tweets,stopwords):
RegexNames = dict()
for tweet in tweets:
tweetnames = re.findall('([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)'," ".join(tweet))
for name in tweetnames:
if name not in stopwords:
if name not in RegexNames:
RegexNames[name] = 0
RegexNames[name] += 1
#print RegexNames
return RegexNames
def ObtainNames(worddict,namedict,num):
finalnames = dict()
twords = worddict.copy()
tnames = namedict.copy()
for i in range(num):
dispName = ""
counter = -1
for word in sorted(twords, key=twords.get, reverse=True):
for name in sorted(tnames, key=tnames.get, reverse=True):
if word in name:
if tnames[name] + twords[word] > counter:
counter = tnames[name] + twords[word]
dispName = name
finalnames[dispName] = counter
#print finalnames
return finalnames.keys()
def getHosts(year):
hostTweets = lookthroughTweets(host_keywords, year)
wordDict = CreateWords(hostTweets, stopwords)
nameList = CreateNames(hostTweets, stopwords)
hosts = ObtainNames(wordDict, nameList, 1)
output = []
for host in hosts:
h_final = host.encode("utf-8")
output.append(h_final)
hosts = output
hosts = output[0].split()
newHosts = []
for num in range(0,len(hosts), 2):
newHosts.append(hosts[num] + ' ' + hosts[num + 1])
#print newHosts
return newHosts
def getHumor(year):
humorTweets = lookthroughTweets(humor_keywords, year)
wordDict = CreateWords(humorTweets, stopwords)
nameList = CreateNames(humorTweets, stopwords)
comedians = ObtainNames(wordDict, nameList, 1)
output = []
for comedian in comedians:
c_final = comedian.encode("utf-8")
output.append(c_final)
comedians = output
#comedians.replace("Show ", "")
print comedians[0]
return comedians
def getPresenters(year):
winners = find_winners(year)
presenters = dict()
presentersTweets = lookthroughTweets(presenters_keywords, year)
for w in winners:
test = []
#presenters_keywords.append(winners[w])
#presentersTweets = lookthroughTweets(presenters_keywords)
test = winners[w].split()
#print test
specificTweets = lookthroughTweets2(presentersTweets, test)
#print specificTweets
for t in test:
stopwords.append(t)
#print stopwords
wordDict = CreateWords(specificTweets, stopwords)
nameList = CreateNames(specificTweets, stopwords)
presentersper = ObtainNames(wordDict, nameList, 4)
output = []
for presenter in presentersper:
presenters_final = presenter.encode("utf-8")
output.append(presenters_final)
presentersper = output
presentersper = output
presenters[w] = presentersper
#print presentersper
#presenters.key() = winners.key()
pop_amount = len(test)
for x in range(0, pop_amount):
stopwords.pop()
print presenters
return presenters
##
##getHosts("2013")
##getHosts("2015")
#getPresenters("2013")
#getHumor("2013")
if __name__ == '__main__':
main()