-
Notifications
You must be signed in to change notification settings - Fork 0
/
WhoSampledScraper.py
200 lines (154 loc) · 6.17 KB
/
WhoSampledScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
File: WhoSampledScraper.py
Author: Brad Jobe
Version: 0.0.1
Scrapes relevant HTML content from an artist's track page
"""
import sys
import eyed3
import urllib2
from lxml import html
import httplib
import json
class WhoSampledScraper:
URL_WHOSAMPLED = "www.whosampled.com"
HTTP_PROTO = "http://"
HTTP_REDIRECT = "3"
SONGS_SAMPLED = 0
WHO_SAMPLED = 2
SONGS_SAMPLED_CALL = "songsSampled"
WHO_SAMPLED_CALL = "whoSampled"
def __init__(self, songLoc):
"""
Parses a songfile for the artist and title ID3 tags and creates the
theoretical path for the songfile's samplepage.
Param: The directory path to a song file, as a string.
Throws: MissingTagException if tag(s) could not be found.
"""
songfile = eyed3.load(songLoc)
try:
self.whoSampledHTML = None
self.artistName = songfile.tag.artist
self.songTitle = songfile.tag.title
self.sampleJSON = {}
if self.artistName == None or self.songTitle == None:
raise MissingTagException()
except MissingTagException:
print "audiofile at " + songLoc + " has missing tag information"
self.whoSampledPath = ("/" + self.artistName + "/" + \
self.songTitle + "/").replace(" ", "-")
self.sampleJSON[self.whoSampledPath] = { self.SONGS_SAMPLED_CALL:{}, \
self.WHO_SAMPLED_CALL: {} }
def getSongsSampled(self):
"""
Returns a list of songs that were sampled in the given track.
"""
jsonSamples = self.sampleScraper(self.SONGS_SAMPLED_CALL)
return self.convertJsontoList(jsonSamples)
def getWhoSampled(self):
"""
Returns a list of songs that have used the given track as a sample.
"""
jsonSamples = self.sampleScraper(self.WHO_SAMPLED_CALL)
return self.convertJsontoList(jsonSamples)
def getHTMLFromPath(self):
"""
Returns the html content from the song's sample page.
Throws: RedirectException if the url is redirected away from the
predicted path of the songs's sample page.
"""
urlCheck = urllib2.urlopen(self.HTTP_PROTO + \
self.URL_WHOSAMPLED + \
self.whoSampledPath)
try:
if urlCheck.geturl().lower() != (self.HTTP_PROTO + \
self.URL_WHOSAMPLED + \
self.whoSampledPath).lower():
raise RedirectException()
except RedirectException:
print "The URL of " + self.songTitle + " by " + self.artistName + \
" was redirected."
return None
return urlCheck.read()
def sampleScraper(self, calltype):
"""
Scrapes sample data from the song's sample page.
Params: a string of specifying what type of sample data is to be
scraped from the sample page.
Returns: a list of song samples, as strings, or an empty list.
"""
self.cachedSamples = self.loadCachedSampleData()
try:
self.cachedSamples[self.whoSampledPath] == None
except KeyError:
self.sampleJson = self.searchForSampleData(calltype)
else:
self.sampleJson = self.cachedSamples[self.whoSampledPath][calltype]
return self.sampleJson
def searchForSampleData(self, calltype):
"""
loads html of artist's track page on WhoSampled.com and parses it for
the relevant sample data.
args: specific type of sample data to parse for
returns: None if sample data could not be found, returns sample data
in json format if successful page parse
"""
if self.whoSampledHTML == None:
self.whoSampledHTML = self.getHTMLFromPath()
if self.whoSampledHTML == None:
return None
splitHTML = self.whoSampledHTML.split("<span Was sampled")
if calltype == self.SONGS_SAMPLED_CALL:
whoSampledDoc = html.document_fromstring( \
splitHTML[self.SONGS_SAMPLED])
elif calltype == self.WHO_SAMPLED_CALL and len(splitHTML) > 1:
whoSampledDoc = html.document_fromstring( \
splitHTML[self.WHO_SAMPLED])
elif len(splitHTML) <= 1:
return None
artistNamesSamples = whoSampledDoc.find_class("trackArtist")
songTitlesSamples = whoSampledDoc.find_class("trackName")
if len(artistNamesSamples) != len(songTitlesSamples) \
or len(artistNamesSamples) < 1:
return None
for i in range(0, len(artistNamesSamples)):
a = artistNamesSamples[i].text_content()
s = songTitlesSamples[i].text_content()
self.sampleJSON[self.whoSampledPath][calltype][a] = s
self.cacheSampleData()
return self.sampleJSON
def loadCachedSampleData(self):
"""
loads stored sample data from previous lookups
returns: json sample data
"""
with open("samples.json", "r") as inSampleFile:
jsonData = json.load(inSampleFile)
inSampleFile.close()
return jsonData
def cacheSampleData(self):
"""
stores sample data that has not been previously cached
"""
self.cachedSamples[self.whoSampledPath] \
= self.sampleJSON[self.whoSampledPath]
with open('samples.json', 'w') as outSampleFile:
json.dump(self.cachedSamples, outSampleFile)
outSampleFile.close()
def convertJsontoList(self, jsonSampleData):
"""
converts JSON sampled data to a python list
args: json to be converted
returns: python list of converted data
"""
sampleList = []
sampleDict = jsonSampleData
if bool(sampleDict) == False:
return None
for key in sampleDict:
sampleList.append(str(sampleDict[key]) + " " + str(key))
return sampleList
class RedirectException(Exception):
pass
class MissingTagException(Exception):
pass