class WaveformSpider(scrapy.Spider):
    """Download the waveform JSON for every known track and persist it.

    Pipeline: DB track ids -> SoundCloud track API (to discover the
    ``waveform_url``) -> waveform endpoint -> ``insert_waveform``.
    """

    name = 'waveform'
    start_urls = ['https://soundcloud.com/']

    def __init__(self):
        cfg = util.load_config()
        self.config = cfg
        util.register_gcp_credential(cfg)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(cfg)
        self.gcphandler = GCPHandler(cfg)
        # Silence all library warnings for the duration of the crawl.
        warnings.filterwarnings(action='ignore')

    def parse(self, response):
        """Queue one track-metadata request per track id found in the DB."""
        for row in self.dbhandler.select_all_track_ids():
            track_id = row[0]
            api_url = (
                f'https://api-v2.soundcloud.com/tracks/{track_id}'
                f'?client_id={self.config["CLIENT_ID"]}'
            )
            request = scrapy.Request(api_url, self.parse_track)
            request.meta['track_id'] = track_id
            yield request

    def parse_track(self, response):
        """Extract ``waveform_url`` from the track JSON and follow it."""
        track_id = response.meta['track_id']
        waveform_url = json.loads(response.body)['waveform_url']
        request = scrapy.Request(url=waveform_url, callback=self.parse_waveform)
        request.meta['track_id'] = track_id
        yield request

    def parse_waveform(self, response):
        """Persist the downloaded waveform JSON for this track."""
        track_id = response.meta['track_id']
        waveform = json.loads(response.body)
        self.dbhandler.insert_waveform(track_id, waveform)
def __init__(self):
    """Load config, register GCP credentials, and build DB/GCP handlers."""
    cfg = util.load_config()
    self.config = cfg
    util.register_gcp_credential(cfg)
    self.target_ids = util.load_target_ids('target.txt')
    self.dbhandler = DBHandler(cfg)
    self.gcphandler = GCPHandler(cfg)
    # Suppress every warning for the remainder of the crawl.
    warnings.filterwarnings(action='ignore')
def __init__(self):
    """Set up config/DB access and precompile the quoted-tag regex."""
    cfg = util.load_config()
    self.config = cfg
    util.register_gcp_credential(cfg)
    self.target_ids = util.load_target_ids('target.txt')
    self.dbhandler = DBHandler(cfg)
    self.user_sids = self.dbhandler.select_user_sids(self.target_ids)
    # Non-greedy match for text wrapped in double quotes.
    self.re_quoted = re.compile(r'\"(.+?)\"')
def __init__(self):
    """Silence warnings, then load config and construct DB/GCP handlers."""
    warnings.simplefilter("ignore")
    cfg = util.load_config()
    self.config = cfg
    util.register_gcp_credential(cfg)
    self.target_ids = util.load_target_ids('target.txt')
    self.dbhandler = DBHandler(cfg)
    self.gcphandler = GCPHandler(cfg)
class TrackColorSpider(scrapy.Spider):
    """Annotate each track with the dominant color of its artwork thumbnail.

    For every track with an artwork thumbnail stored on GCS, calls the
    Google Vision ``images:annotate`` endpoint (IMAGE_PROPERTIES) and
    stores the dominant color as a ``red|green|blue`` string in the DB.
    """

    name = 'track_color'
    start_urls = ['https://soundcloud.com/']

    def __init__(self):
        warnings.simplefilter("ignore")
        self.config = util.load_config()
        util.register_gcp_credential(self.config)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(self.config)
        self.gcphandler = GCPHandler(self.config)

    def parse(self, response):
        """Query the Vision API for each stored thumbnail and save its color.

        NOTE(review): ``requests.post`` is synchronous and blocks Scrapy's
        event loop; acceptable for small batches, but worth converting to a
        scrapy.Request if the track count grows.
        """
        track_infos = self.dbhandler.select_all_track_ids_thumbnail()
        for track_id, track_artwork_thumbnail in track_infos:
            if not track_artwork_thumbnail:
                continue  # no artwork stored for this track
            # Vision wants gs:// URIs, not public storage URLs.
            track_artwork_thumbnail = track_artwork_thumbnail.replace(
                'https://storage.googleapis.com/', 'gs://')
            headers = {
                'Authorization': f'Bearer {self.config["VISION_TOKEN"]}',
                'Content-Type': 'application/json; charset=utf-8'
            }
            payload = {
                "requests": [{
                    "image": {
                        "source": {
                            "gcsImageUri": track_artwork_thumbnail
                        }
                    },
                    "features": [{
                        "maxResults": 1,
                        "type": "IMAGE_PROPERTIES"
                    }]
                }]
            }
            # BUG FIX: result was previously assigned to ``response``,
            # shadowing parse()'s own argument; use a dedicated name.
            vision_response = requests.post(
                url='https://vision.googleapis.com/v1/images:annotate',
                data=json.dumps(payload),
                headers=headers)
            try:
                color_info = vision_response.json()
                color_info = color_info['responses'][0][
                    'imagePropertiesAnnotation']['dominantColors']['colors'][
                        0]['color']
                track_color = f'{color_info["red"]}|{color_info["green"]}|{color_info["blue"]}'
                self.dbhandler.update_track_color(track_id, track_color)
            # BUG FIX: was a bare ``except:`` that also swallowed
            # SystemExit/KeyboardInterrupt; catch only the failures a bad
            # Vision response can produce (missing keys/indices, non-JSON body).
            except (KeyError, IndexError, TypeError, ValueError):
                print('exception: ', track_artwork_thumbnail)
class TrackUserProfileSpider(scrapy.Spider):
    """Copy each user's profile fields onto all of that user's track rows."""

    name = 'track_user_profile'
    start_urls = ['https://soundcloud.com/']

    def __init__(self):
        warnings.simplefilter("ignore")
        self.config = util.load_config()
        util.register_gcp_credential(self.config)
        self.dbhandler = DBHandler(self.config)

    def parse(self, response):
        """Denormalize user columns 1 and 2 onto every track of each user."""
        for user_info in self.dbhandler.select_all_users():
            user_key = user_info[0]
            # presumably profile-image URL columns; verify against DB schema
            field_a, field_b = user_info[1], user_info[2]
            for track_row in self.dbhandler.select_user_tracks(user_key):
                track_key = track_row[0]
                self.dbhandler.update_track_user_profile(
                    track_key, field_a, field_b)
                print(track_key, field_a, field_b)
class TagSpider(scrapy.Spider):
    """Scrape the tag list from each track's public page and store it."""

    name = 'tag'
    start_urls = ['https://soundcloud.com/']

    def __init__(self):
        cfg = util.load_config()
        self.config = cfg
        util.register_gcp_credential(cfg)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(cfg)
        self.user_sids = self.dbhandler.select_user_sids(self.target_ids)
        self.re_quoted = re.compile(r'\"(.+?)\"')

    def parse(self, response):
        """Queue a page request for every track permalink of every target user."""
        for user_sid in self.user_sids:
            rows = self.dbhandler.select_track_permanlink(user_sid)
            for track_id, track_permalink in rows:
                request = scrapy.Request(track_permalink,
                                         self.parse_track_page)
                request.meta['track_id'] = track_id
                yield request

    def parse_track_page(self, response):
        """Pull ``tag_list`` out of the page's hydration script and save tags."""
        track_id = response.meta['track_id']
        page_soup = BeautifulSoup(response.body, 'lxml')
        # The last <script> carries the hydration payload; slice out its JSON.
        raw = str(page_soup.find_all('script')[-1])
        raw = raw.split('"data":[')[-1].replace(']}]);</script>', '')
        track_info = json.loads(raw)
        tags_str = track_info['tag_list']
        tags_str = tags_str.replace('soundcloud:source=android-record', '')
        # Quoted tags may contain spaces; the remainder are space-separated.
        quoted_tags = [t.strip() for t in self.re_quoted.findall(tags_str) if t]
        simple_tags = [
            t.strip() for t in self.re_quoted.sub('', tags_str).split(' ') if t
        ]
        tags = quoted_tags + simple_tags
        if tags:
            self.dbhandler.insert_tags(track_id, tags)
class TrackPopularitySpider(scrapy.Spider):
    """Refresh like/playback counts for every target user's tracks."""

    name = 'track_popularity'
    start_urls = ['https://soundcloud.com/']

    def __init__(self):
        warnings.simplefilter("ignore")
        self.config = util.load_config()
        util.register_gcp_credential(self.config)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(self.config)
        self.gcphandler = GCPHandler(self.config)

    def parse(self, response):
        """Kick off one paginated track-listing request per target user."""
        url_head = "https://api-v2.soundcloud.com/users/{0}"
        url_tail = f"/tracks?representation=&client_id={self.config['CLIENT_ID']}&limit=20&offset=0&linked_partitioning=1&app_version=1593604665&app_locale=en"
        for user_sid in self.dbhandler.select_user_sids(self.target_ids):
            yield scrapy.Request(url_head.format(user_sid) + url_tail,
                                 self.parse_track_popularity)

    def parse_track_popularity(self, response):
        """Store counts for each track on this page, then follow pagination."""
        result_json = json.loads(response.body)
        collections = result_json['collection']
        if not collections:
            return
        for collection in collections:
            # The API may return null counts; coerce them to 0.
            likes = collection['likes_count'] or 0
            plays = collection['playback_count'] or 0
            self.dbhandler.update_track_popularity(collection['id'],
                                                   likes, plays)
        if result_json['next_href']:
            next_url = (result_json['next_href'] +
                        f'&client_id={self.config["CLIENT_ID"]}')
            yield scrapy.Request(next_url, self.parse_track_popularity)
class TrackNoHlsSpider(scrapy.Spider):
    """Re-download and re-segment tracks whose HLS entry is missing.

    Walks every user's paginated track API; for tracks whose DB row has no
    HLS data yet, downloads the m3u8 playlist and shells out to ffmpeg to
    rebuild an mp3 plus 30-second mpegts segments under ``./tmp``.
    """

    name = 'track_nohls'
    start_urls = ['https://soundcloud.com/']

    def __init__(self):
        warnings.simplefilter("ignore")
        self.config = util.load_config()
        util.register_gcp_credential(self.config)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(self.config)
        self.gcphandler = GCPHandler(self.config)
        self.track_nohls_ids = self.dbhandler.select_track_nohls()

    def parse(self, response):
        """Start one paginated track listing per user sid in the DB."""
        user_sids = self.dbhandler.select_all_user_sids()
        url_head = "https://api-v2.soundcloud.com/users/{0}"
        url_tail = f"/tracks?representation=&client_id={self.config['CLIENT_ID']}&limit=20&offset=0&linked_partitioning=1&app_version=1593604665&app_locale=en"
        for user_sid in user_sids:
            url = url_head.format(user_sid) + url_tail
            req = scrapy.Request(url, self.parse_tracks)
            req.meta['user_sid'] = user_sid
            yield req

    def parse_tracks(self, response):
        """For each track lacking HLS data, follow its transcoding URL."""
        user_sid = response.meta['user_sid']
        result_json = json.loads(response.body)
        collections = result_json['collection']
        if not collections:
            return
        for collection in collections:
            m3u8_url = ''
            try:
                m3u8_url = collection['media']['transcodings'][0]['url']
            # BUG FIX: was a bare ``except:`` that also swallowed
            # SystemExit/KeyboardInterrupt; only tolerate a missing or
            # oddly shaped ``media`` entry here.
            except (KeyError, IndexError, TypeError):
                pass
            track_json = {
                'created_at': collection['created_at'],
                'track_id': collection['id'],
                'track_user_sid': user_sid,
                # Optional API fields may be null; coerce to '' / 0.
                'track_title': collection['title'] or '',
                'track_description': collection['description'] or '',
                'track_duration': collection['duration'],
                'track_genre': collection['genre'] or '',
                'track_permalink': collection['permalink_url'] or '',
                'track_likes_count': collection['likes_count'] or 0,
                'track_playback_count': collection['playback_count'] or 0,
                'track_user_id': collection['user']['permalink'] or '',
                'track_user_name': collection['user']['username'] or '',
            }
            track_hls = self.dbhandler.select_hls_per_track(
                track_json['track_id'])
            if not track_hls:
                continue
            # Only fetch when the track row exists but has no HLS yet.
            if not track_hls[0][0] and m3u8_url:
                m3u8_req = scrapy.Request(
                    m3u8_url + f'?client_id={self.config["CLIENT_ID"]}',
                    self.parse_m3u8_proxy)
                m3u8_req.meta['track_json'] = track_json
                yield m3u8_req
        if result_json['next_href']:
            url = (result_json['next_href'] +
                   f'&client_id={self.config["CLIENT_ID"]}')
            req = scrapy.Request(url, self.parse_tracks)
            req.meta['user_sid'] = user_sid
            yield req

    def parse_m3u8_proxy(self, response):
        """The transcoding endpoint returns ``{'url': <playlist>}``; follow it."""
        track_json = response.meta['track_json']
        m3u8_url = json.loads(response.body)['url']
        m3u8_req = scrapy.Request(m3u8_url, self.parse_m3u8)
        m3u8_req.meta['track_json'] = track_json
        yield m3u8_req

    def parse_m3u8(self, response):
        """Write the playlist to disk and rebuild mp3 + ts segments via ffmpeg."""
        track_json = response.meta['track_json']
        track_id = track_json['track_id']
        track_user_sid = track_json['track_user_sid']
        artistdir = f'./tmp/{track_user_sid}'
        trackdir = f'{artistdir}/{track_id}'
        # BUG FIX: exists()+mkdir() raced with concurrent callbacks and
        # crashed on the lost race; makedirs(exist_ok=True) creates both
        # levels and is safe to repeat.
        os.makedirs(trackdir, exist_ok=True)
        m3u8_path = f'{trackdir}/{track_id}.m3u8'
        mp3_path = f'{trackdir}/{track_id}.mp3'
        playlist_path = f'{trackdir}/playlist.m3u8'
        ts_format = f'{trackdir}/output%03d.ts'
        with open(m3u8_path, 'wb') as output:
            # FIX: dropped the pointless io.BytesIO(...).read() round-trip;
            # response.body is already bytes.
            output.write(response.body)
        # NOTE(review): os.system with interpolated paths is shell-injection
        # prone in general; appears safe here because ids come from the API
        # as numeric values — confirm, or switch to subprocess.run([...]).
        os.system(
            f'ffmpeg -protocol_whitelist file,http,https,tcp,tls,crypto -i {m3u8_path} -c copy {mp3_path}'
        )
        os.system(
            f'ffmpeg -i {mp3_path} -c:a libmp3lame -b:a 128k -f segment -segment_time 30 -segment_list {playlist_path} -segment_format mpegts {ts_format}'
        )
        os.remove(m3u8_path)
def __init__(self):
    """Load config, register GCP credentials, and build DB/GCP handlers."""
    cfg = util.load_config()
    self.config = cfg
    util.register_gcp_credential(cfg)
    self.target_ids = util.load_target_ids('target.txt')
    self.dbhandler = DBHandler(cfg)
    self.gcphandler = GCPHandler(cfg)
class ArtistSpider(scrapy.Spider):
    """Crawl each target artist's page; store user row, profile and banner.

    Images are written to ./tmp, uploaded to GCS via the GCP handler, and
    the resulting URLs are written back to the user's DB row.
    """

    name = 'artist'
    start_urls = ['https://soundcloud.com/']

    def __init__(self):
        cfg = util.load_config()
        self.config = cfg
        util.register_gcp_credential(cfg)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(cfg)
        self.gcphandler = GCPHandler(cfg)

    def parse(self, response):
        """Queue one user-page request per target id."""
        for target_id in self.target_ids:
            yield scrapy.Request(f'https://soundcloud.com/{target_id}',
                                 self.parse_userpage)

    def parse_userpage(self, response):
        """Parse the hydration JSON, insert the user, then fetch images."""
        soup = BeautifulSoup(response.body, 'lxml')
        # The last <script> carries the hydration payload; slice out its JSON.
        raw = str(soup.find_all('script')[-1])
        raw = raw.split('"data":[')[-1].replace(']}]);</script>', '')
        user_info = json.loads(raw)
        profile_link = user_info['avatar_url']
        if profile_link:
            # Swap the 'large' variant for the 500x500 one.
            profile_link = profile_link.replace('large', 't500x500')
        else:
            profile_link = ''
        visuals = user_info['visuals']
        banner_link = visuals['visuals'][0]['visual_url'] if visuals else ''
        user_json = {
            "user_id": user_info['permalink'],
            "user_sid": user_info['id'],
            "user_name": user_info['username'] or '',
            "user_full_name": user_info['full_name'] or '',
            "user_description": user_info['description'] or '',
            "user_country": user_info['country_code'] or '',
            "user_city": user_info['city'] or '',
            "user_type": 0
        }
        self.dbhandler.insert_user(user_json)
        if profile_link:
            profile_req = scrapy.Request(profile_link, self.parse_profile_img)
            profile_req.meta['user_json'] = user_json
            yield profile_req
        if banner_link:
            banner_req = scrapy.Request(banner_link, self.parse_banner_img)
            banner_req.meta['user_json'] = user_json
            yield banner_req

    def parse_profile_img(self, response):
        """Save original + 128x128 thumbnail, upload both, update the DB."""
        user_json = response.meta['user_json']
        image = Image.open(io.BytesIO(response.body))
        profile_name = f'{user_json["user_id"]}_profile_500x500.jpg'
        profile_thumbnail_name = f'{user_json["user_id"]}_profile_128x128.jpg'
        image.save(f'./tmp/{profile_name}')
        image = image.resize((128, 128))
        image.save(f'./tmp/{profile_thumbnail_name}')
        profile_url = self.gcphandler.upload_file(
            f'./tmp/{profile_name}',
            f'users/profiles/org/{user_json["user_id"]}.jpg')
        profile_thumbnail_url = self.gcphandler.upload_file(
            f'./tmp/{profile_thumbnail_name}',
            f'users/profiles/thumbnail/{user_json["user_id"]}.jpg')
        user_json["user_profile_org"] = profile_url
        user_json["user_profile_thumbnail"] = profile_thumbnail_url
        # Local copies are no longer needed once uploaded.
        os.remove(f'./tmp/{profile_name}')
        os.remove(f'./tmp/{profile_thumbnail_name}')
        self.dbhandler.update_user_profile(user_json)

    def parse_banner_img(self, response):
        """Save the banner image, upload it, and update the user row."""
        user_json = response.meta['user_json']
        image = Image.open(io.BytesIO(response.body))
        banner_name = f'{user_json["user_id"]}_banner.jpg'
        image.save(f'./tmp/{banner_name}')
        banner_url = self.gcphandler.upload_file(
            f'./tmp/{banner_name}',
            f'users/banners/{user_json["user_id"]}.jpg')
        user_json["user_banner"] = banner_url
        self.dbhandler.update_user_banner(user_json)
        os.remove(f'./tmp/{banner_name}')
def __init__(self):
    """Silence warnings, load config/credentials, and open the DB handler."""
    warnings.simplefilter("ignore")
    cfg = util.load_config()
    self.config = cfg
    util.register_gcp_credential(cfg)
    self.dbhandler = DBHandler(cfg)
class CommentsSpider(scrapy.Spider):
    """Crawl up to 6 pages of comments per track; store comments + commenters.

    Commenters are inserted as type-1 users and their profile images are
    downloaded, thumbnailed, uploaded to GCS, and linked back in the DB.
    """

    name = 'comment'
    start_urls = ['https://soundcloud.com/']

    def __init__(self):
        cfg = util.load_config()
        self.config = cfg
        util.register_gcp_credential(cfg)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(cfg)
        self.gcphandler = GCPHandler(cfg)
        warnings.filterwarnings(action='ignore')

    def parse(self, response):
        """Queue the first comments page for every track of every target."""
        for target_id in self.target_ids:
            for track_row in self.dbhandler.select_track_ids(target_id):
                track_id = track_row[0]
                comments_url = f'https://api-v2.soundcloud.com/tracks/{track_id}/comments?filter_replies=0&threaded=1&client_id={self.config["CLIENT_ID"]}&offset=0&limit=20&app_version=1595511948&app_locale=en'
                request = scrapy.Request(comments_url, self.parse_comment)
                request.meta['cnt'] = 0
                request.meta['user_id'] = target_id
                request.meta['track_id'] = track_id
                yield request

    def parse_comment(self, response):
        """Store this page's comments/commenters; paginate up to 6 pages."""
        cnt = response.meta['cnt']
        if cnt > 5:
            return  # cap pagination depth per track
        user_id = response.meta['user_id']
        track_id = response.meta['track_id']
        comment_json = json.loads(response.body)
        collections = comment_json['collection']
        next_href = comment_json['next_href']
        comments = []
        for collection in collections:
            comments.append({
                "created_at":
                self.trim_created_at(collection['created_at']),
                "comment_user_id": user_id,
                "comment_uploader_id": collection['user']['permalink'],
                "comment_track_id": track_id,
                "comment_body": collection['body'] or '',
            })
            avatar_link = collection['user']['avatar_url']
            avatar_link = avatar_link.replace('large', 't500x500') if avatar_link else ''
            user_json = {
                "user_id": collection['user']['permalink'],
                "user_sid": collection['user']['id'],
                "user_name": collection['user']['username'] or '',
                "user_full_name": collection['user']['full_name'] or '',
                "user_description": '',
                "user_country": collection['user']['country_code'] or '',
                "user_city": collection['user']['city'] or '',
                "user_type": 1
            }
            self.dbhandler.insert_user(user_json)
            if avatar_link:
                profile_req = scrapy.Request(avatar_link,
                                             self.parse_profile_img)
                profile_req.meta['user_json'] = user_json
                yield profile_req
        self.dbhandler.insert_comments(comments)
        if next_href:
            request = scrapy.Request(url=next_href,
                                     callback=self.parse_comment)
            request.meta['cnt'] = cnt + 1
            request.meta['user_id'] = user_id
            request.meta['track_id'] = track_id
            yield request

    def parse_profile_img(self, response):
        """Save original + 128x128 thumbnail, upload both, update the DB."""
        user_json = response.meta['user_json']
        image = Image.open(io.BytesIO(response.body))
        profile_name = f'{user_json["user_id"]}_profile_500x500.jpg'
        profile_thumbnail_name = f'{user_json["user_id"]}_profile_128x128.jpg'
        image.save(f'./tmp/{profile_name}')
        image = image.resize((128, 128))
        image.save(f'./tmp/{profile_thumbnail_name}')
        profile_url = self.gcphandler.upload_file(
            f'./tmp/{profile_name}',
            f'users/profiles/org/{user_json["user_id"]}.jpg')
        profile_thumbnail_url = self.gcphandler.upload_file(
            f'./tmp/{profile_thumbnail_name}',
            f'users/profiles/thumbnail/{user_json["user_id"]}.jpg')
        user_json["user_profile_org"] = profile_url
        user_json["user_profile_thumbnail"] = profile_thumbnail_url
        os.remove(f'./tmp/{profile_name}')
        os.remove(f'./tmp/{profile_thumbnail_name}')
        self.dbhandler.update_user_profile(user_json)

    def trim_created_at(self, created_at_str):
        """Convert an ISO-8601-ish timestamp to 'YYYY-MM-DD HH:MM:SS' form."""
        return created_at_str.replace('T', ' ').replace('Z', '')
class TrackArtworkSpider(scrapy.Spider):
    """Walk each target user's tracks and (optionally) mirror their artwork.

    NOTE(review): the ``yield artwork_req`` below is commented out, so the
    artwork download/upload path (parse_artwork_img) is currently disabled;
    parse_tracks only prints artwork URLs and follows pagination.
    """

    name = 'track_artwork'
    start_urls = ['https://soundcloud.com/']

    def __init__(self):
        warnings.simplefilter("ignore")
        self.config = util.load_config()
        util.register_gcp_credential(self.config)
        self.target_ids = util.load_target_ids('target.txt')
        self.dbhandler = DBHandler(self.config)
        self.gcphandler = GCPHandler(self.config)

    def parse(self, response):
        """Start one paginated track-listing request per target user."""
        user_sids = self.dbhandler.select_user_sids(self.target_ids)
        url_head = "https://api-v2.soundcloud.com/users/{0}"
        url_tail = f"/tracks?representation=&client_id={self.config['CLIENT_ID']}&limit=20&offset=0&linked_partitioning=1&app_version=1593604665&app_locale=en"
        for user_sid in user_sids:
            url = url_head.format(user_sid) + url_tail
            req = scrapy.Request(url, self.parse_tracks)
            req.meta['user_sid'] = user_sid
            yield req

    def parse_tracks(self, response):
        """Build a track_json per track; prepare (disabled) artwork requests."""
        # FIX: removed the unused ``m3u8_url`` extraction (and its bare
        # ``except: pass``) and the dead ``tracks`` accumulator — neither
        # value was ever consumed in this spider.
        user_sid = response.meta['user_sid']
        result_json = json.loads(response.body)
        collections = result_json['collection']
        if not collections:
            return
        for collection in collections:
            artwork_url = collection['artwork_url']
            track_json = {
                'created_at': collection['created_at'],
                'track_id': collection['id'],
                'track_user_sid': user_sid,
                # Optional API fields may be null; coerce to '' / 0.
                'track_title': collection['title'] or '',
                'track_description': collection['description'] or '',
                'track_duration': collection['duration'],
                'track_genre': collection['genre'] or '',
                'track_permalink': collection['permalink_url'] or '',
                'track_likes_count': collection['likes_count'] or 0,
                'track_playback_count': collection['playback_count'] or 0,
                'track_user_id': collection['user']['permalink'] or '',
                'track_user_name': collection['user']['username'] or '',
            }
            if artwork_url:
                print(artwork_url)
                artwork_req = scrapy.Request(
                    artwork_url.replace('large', 't500x500'),
                    self.parse_artwork_img)
                artwork_req.meta['track_json'] = track_json
                # yield artwork_req
        if result_json['next_href']:
            url = (result_json['next_href'] +
                   f'&client_id={self.config["CLIENT_ID"]}')
            req = scrapy.Request(url, self.parse_tracks)
            req.meta['user_sid'] = user_sid
            yield req

    def parse_artwork_img(self, response):
        """Save artwork + 128x128 thumbnail, upload both, update the track."""
        track_json = response.meta['track_json']
        image = Image.open(io.BytesIO(response.body))
        artwork_name = f'{track_json["track_id"]}_artwork.jpg'
        artwork_thumbnail = f'{track_json["track_id"]}_artwork_thumb.jpg'
        image.save(f'./tmp/{artwork_name}')
        artwork_url = self.gcphandler.upload_file(
            f'./tmp/{artwork_name}',
            f'tracks/artwork/org/{track_json["track_id"]}.jpg')
        image = image.resize((128, 128))
        image.save(f'./tmp/{artwork_thumbnail}')
        artwork_tumbnail_url = self.gcphandler.upload_file(
            f'./tmp/{artwork_thumbnail}',
            f'tracks/artwork/thumbnail/{track_json["track_id"]}.jpg')
        track_json["track_artwork"] = artwork_url
        track_json["track_artwork_thumbnail"] = artwork_tumbnail_url
        self.dbhandler.update_track_artwork(track_json)
        os.remove(f'./tmp/{artwork_name}')
        os.remove(f'./tmp/{artwork_thumbnail}')