def search(name, page=1, albums=True, tracks=True, artists=True, labels=False):
    """Search bandcamp.com for *name* and lazily yield result dicts.

    Walks result pages: after exhausting one page it recurses into the next,
    stopping when a page comes back empty or starts repeating items.

    Args:
        name: query string.
        page: 1-based results page to start from.
        albums/tracks/artists/labels: which result types to include.

    Yields:
        dicts produced by the matching BandCamper._parse_* helper.
    """
    params = {"page": page, "q": name}
    response = requests.get('http://bandcamp.com/search', params=params)
    html_doc = response.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    seen = []
    for item in soup.find_all("li", class_="searchresult"):
        item_type = item.find('div', class_='itemtype').text.strip().lower()
        if item_type == "album" and albums:
            data = BandCamper._parse_album(item)
        elif item_type == "track" and tracks:
            data = BandCamper._parse_track(item)
        elif item_type == "artist" and artists:
            data = BandCamper._parse_artist(item)
        elif item_type == "label" and labels:
            data = BandCamper._parse_label(item)
        else:
            continue  # unrecognized or filtered-out result type
        yield data
        seen.append(data)
    if not seen:
        return  # empty page -> no more pages
    # Recurse into the next page; seeing an item from THIS page again means
    # bandcamp started repeating results, so bail out.
    for item in BandCamper.search(name, page=page + 1, albums=albums,
                                  tracks=tracks, artists=artists,
                                  labels=labels):
        if item in seen:
            return  # duplicate data, fail safe out of loops
        yield item
def get_stream_data(url):
    """Scrape a bandcamp track page and return its stream metadata.

    Parses the ld+json blob embedded in the page at *url*.

    Returns:
        dict with keys: categories, album_name, artist, image, title, url,
        tags, and — when present in the page data — stream (mp3-128 URL)
        and length (milliseconds).
    """
    txt_string = requests.get(url).text
    # The page embeds track metadata in the last ld+json <script> element.
    json_blob = txt_string. \
        split('<script type="application/ld+json">')[-1]. \
        split("</script>")[0]
    data = json.loads(json_blob)
    artist_data = data['byArtist']
    album_data = data['inAlbum']
    result = {
        "categories": data["@type"],
        'album_name': album_data['name'],
        'artist': artist_data['name'],
        'image': data['image'],
        "title": data['name'],
        "url": url,
        "tags": data['keywords'].split(", ") + data["tags"]
    }
    for p in data['additionalProperty']:
        if p['name'] == 'file_mp3-128':
            result["stream"] = p["value"]
        if p['name'] == 'duration_secs':
            result["length"] = p["value"] * 1000  # seconds -> milliseconds
    return result
def extract_ldjson_blob(url, clean=False):
    """Fetch *url* and return the parsed ld+json blob embedded in the page.

    Args:
        url: page to fetch.
        clean: when True, recursively strip "@" from every dict key
            (e.g. "@type" -> "type").

    Returns:
        The decoded JSON data, optionally with cleaned keys.
    """
    txt_string = requests.get(url).text
    json_blob = txt_string. \
        split('<script type="application/ld+json">')[-1]. \
        split("</script>")[0]
    data = json.loads(json_blob)

    def _clean_list(items):
        # Recurse into containers held by a list, in place.
        for idx, v in enumerate(items):
            if isinstance(v, dict):
                items[idx] = _clean_dict(v)
            if isinstance(v, list):
                items[idx] = _clean_list(v)
        return items

    def _clean_dict(d):
        # Rebuild the dict with "@"-free keys, recursing into containers.
        # (renamed local: the original `clean = {}` shadowed the parameter)
        cleaned = {}
        for k, v in d.items():
            if isinstance(v, dict):
                v = _clean_dict(v)
            if isinstance(v, list):
                v = _clean_list(v)
            cleaned[k.replace("@", "")] = v
        return cleaned

    return _clean_dict(data) if clean else data
def get_track_lyrics(track_url):
    """Return the lyrics text of the bandcamp track page at *track_url*.

    Falls back to the literal string "lyrics unavailable" when the page
    has no (non-empty) lyrics section.
    """
    html = requests.get(track_url).text
    soup = BeautifulSoup(html, 'html.parser')
    lyrics_div = soup.find("div", {"class": "lyricsText"})
    # Truthiness check on the Tag: missing or empty section -> fallback text.
    return lyrics_div.text if lyrics_div else "lyrics unavailable"
def extract_blob(url, params=None):
    """Fetch *url* and return the first JSON "data-blob" attribute found.

    Bandcamp pages carry page state in a data-blob HTML attribute, quoted
    with either single or double quotes; both forms are tried in turn.

    Returns:
        Decoded JSON (dict/list), or None when no data-blob is present.
    """
    blob = requests.get(url, params=params).text
    # Single-quoted attribute: data-blob='{...}'
    for b in blob.split("data-blob='")[1:]:
        json_blob = b.split("'")[0]
        return json.loads(json_blob)
    # Double-quoted attribute: inner quotes are HTML-escaped as &quot; —
    # the original code had this entity mangled into a raw quote character.
    for b in blob.split('data-blob="')[1:]:
        json_blob = b.split('"')[0].replace("&quot;", '"')
        return json.loads(json_blob)
def get_albums(url):
    """Scrape the artist page at *url* and return its albums.

    Returns:
        list of BandcampAlbum objects built from album_name, image and url.
    """
    albums = []
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    # Distinct names per role (the original reused `album`/`album_url` for
    # both the parsed Tags and the final values).
    for anchor in soup.find_all("a"):
        title_tag = anchor.find("p", {"class": "title"})
        if not title_tag:
            continue  # anchor is not an album card
        title = title_tag.text.strip()
        art = anchor.find("div", {"class": "art"}).find("img")["src"]
        # NOTE(review): naive concatenation — assumes hrefs are site-relative
        # and *url* has no trailing slash; confirm, or switch to urljoin.
        album_url = url + anchor["href"]
        albums.append(BandcampAlbum({"album_name": title,
                                     "image": art,
                                     "url": album_url}))
    return albums