/
MusicRecommender.py
133 lines (101 loc) · 3.52 KB
/
MusicRecommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import json, re, sys, os
import musicbrainzngs as mbz
# Register this application's name, version and contact address with
# musicbrainzngs; this runs at import time, before any search call below.
# NOTE(review): presumably required by MusicBrainz before requests are
# accepted -- confirm against the musicbrainzngs documentation.
mbz.set_useragent('course-project', '1.0', 'zcarciu@albany.edu')
def main():
    """Command-line entry point.

    Expects exactly one command-line argument: the path of the raw tweets
    file.  With any other argument count a usage message is printed and the
    process exits with status 1.  Otherwise a Recommender is built for the
    file and its pipeline is run; a MusicBrainz ResponseError raised by the
    pipeline is reported on stdout instead of propagating.
    """
    if len(sys.argv) != 2:
        print("Usage: \n$ python MusicRecommender.py <raw-tweets-file>")
        # Exit non-zero so shells and calling scripts can detect the misuse
        # (a bare sys.exit() would report success).
        sys.exit(1)

    recommender = Recommender(sys.argv[1])
    try:
        recommender.make_association_rules()
    except mbz.musicbrainz.ResponseError:
        print("A response error has occurred!")
class Recommender(object):
    """Turn a file of raw tweets into per-user song and artist lists.

    The pipeline, driven by make_association_rules, is:
    clean_raw_tweets -> assign_songs -> assign_user_info.

    All print calls take a single pre-formatted argument so the output is
    the same under Python 2 and Python 3.
    """

    # Text removed from each tweet before song matching.  The
    # case-insensitive flag is passed to re.compile because an inline
    # "(?i)" in the middle of a pattern is rejected by recent Python
    # versions.
    _LINK_OR_AMP = re.compile(r'https:.*|&')
    _PLAYING_TAGS = re.compile(r'#nowplaying|#iheartradio', re.IGNORECASE)
    _LISTENING_TAGS = re.compile(r'#listeningto|@tunein', re.IGNORECASE)

    def __init__(self, raw_tweet_file_name):
        """Remember the raw tweet file path and ensure ./data exists.

        Args:
            raw_tweet_file_name: path of the file holding one JSON tweet
                per line.
        """
        self.raw_tweet_file = raw_tweet_file_name
        # Make the folder that keeps data and intermediate files.
        if not os.path.exists('data'):
            os.mkdir('data')

    def clean_raw_tweets(self):
        """Read the raw tweet file and keep a cleaned subset of each tweet.

        Lines containing "****" are skipped.  Every other line is parsed as
        JSON; links (from "https:" to the end of the line), "&" characters
        and the #nowplaying / #iheartradio / #listeningto / @tunein markers
        are removed from its text.  Lines that are not valid JSON, are not
        JSON objects, or lack the 'text', 'id' or 'user'/'id' fields are
        silently skipped.

        Returns:
            A tuple (cleaned_tweets, tweet_count) where cleaned_tweets is a
            list of dicts with keys 'text', 'userID' and 'tweetID', and
            tweet_count is the length of that list.
        """
        print("Cleaning raw tweets from %s" % self.raw_tweet_file)
        cleaned_tweets = []
        tweet_count = 0
        with open(self.raw_tweet_file, 'r') as f:
            for line in f:
                if "****" in line:
                    continue
                try:
                    tweet = json.loads(line)
                    text = self._LINK_OR_AMP.sub('', tweet['text'])
                    text = self._PLAYING_TAGS.sub('', text)
                    text = self._LISTENING_TAGS.sub('', text)
                    new_tweet = {
                        'text': text,
                        'userID': tweet['user']['id'],
                        'tweetID': tweet['id'],
                    }
                except (KeyError, ValueError, TypeError):
                    # Best effort: malformed or incomplete lines are
                    # dropped.  TypeError covers valid JSON that is not an
                    # object (e.g. a bare number) or a non-string text.
                    continue
                cleaned_tweets.append(new_tweet)
                tweet_count += 1
                print(tweet_count)
        print("Read/Cleaned in %s tweets" % tweet_count)
        return cleaned_tweets, tweet_count

    def assign_songs(self, cleaned_tweets):
        """Match cleaned tweets of the form "<song> by <artist>" to songs.

        cleaned_tweets is a list of dicts as returned by clean_raw_tweets;
        the result ("cleaner" tweets) has a song assigned to each entry.
        Tweets without " by " in their text are ignored.  For the others
        the first MusicBrainz artist hit and the first recording hit for
        that artist are used.

        Args:
            cleaned_tweets: list of dicts with at least 'text' and 'userID'.

        Returns:
            A list of dicts with keys 'song', 'text', 'artist',
            'musicbrainzID' and 'userID'.

        Raises:
            Errors raised by musicbrainzngs other than UnicodeEncodeError
            and IndexError (for example its ResponseError) propagate to the
            caller.
        """
        print("Matching songs to tweets")
        cleaner_tweets = []
        tweets_used = 0
        for cleaned_tweet in cleaned_tweets:
            text = cleaned_tweet['text']
            # Split on the LAST " by " so that titles which themselves
            # contain " by " stay intact.
            song_part, separator, artist_part = text.rpartition(' by ')
            song_query = song_part.strip()
            artist_query = artist_part.strip()
            if not separator or not song_query or not artist_query:
                # No " by ", or nothing to search for on one side of it.
                continue
            try:
                # The artist search uses the text AFTER " by "; the text
                # before it is the song title and is used for the
                # recording search only.
                artists_list = mbz.search_artists(artist_query)
                best_artist = artists_list['artist-list'][0]
                works = mbz.search_recordings(query=song_query,
                                              arid=best_artist['id'])
                best_recording = works['recording-list'][0]
            except (UnicodeEncodeError, IndexError):
                print("Unable to find song/artist")
                continue
            cleaner_tweets.append({
                'song': best_recording['title'],
                'text': text,
                'artist': best_artist['name'],
                'musicbrainzID': best_recording['id'],
                'userID': cleaned_tweet['userID'],
            })
            tweets_used += 1
        print("Tweets used: %s" % tweets_used)
        return cleaner_tweets

    def assign_user_info(self, cleaner_tweets):
        """Group matched songs and artists by the user who tweeted them.

        Args:
            cleaner_tweets: list of dicts with keys 'userID', 'song' and
                'artist' (as returned by assign_songs).

        Returns:
            A list with one independent dict per distinct user, each with
            keys 'userID', 'songs' and 'artists'; the two lists keep the
            order in which that user's tweets appear in the input.
        """
        user_songs = dict()
        user_artists = dict()
        for tweet in cleaner_tweets:
            user_id = tweet['userID']
            user_songs.setdefault(user_id, []).append(tweet['song'])
            user_artists.setdefault(user_id, []).append(tweet['artist'])
        # Build a fresh dict per user; appending one shared dict would make
        # every list entry alias the last user's data.
        return [
            {
                'userID': user_id,
                'songs': user_songs[user_id],
                'artists': user_artists[user_id],
            }
            for user_id in user_songs
        ]

    def make_association_rules(self):
        """Run the whole pipeline and print the resulting user records."""
        cleaned_tweets, _ = self.clean_raw_tweets()
        cleaner_tweets = self.assign_songs(cleaned_tweets)
        users = self.assign_user_info(cleaner_tweets)
        print(users)
        # Association rules code will go here.
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()