async def create_soup(self, url, params=None):
    """Run a GET request to Deezer's JSON API for album data.

    Returns the album JSON dict with a "tracklist" key added.
    Raises ScrapeError if the API does not return valid JSON or the
    metadata could not be assembled.
    """
    params = params or {}
    album_id = self.parse_release_id(url)
    try:
        data = await self.get_json(f"/album/{album_id}", params=params)
        data["tracklist"] = await self.get_tracks(f"/album/{album_id}", params)
        return data
    except json.decoder.JSONDecodeError as e:
        # Plain string (was a stray f-string with no placeholders).
        raise ScrapeError("Deezer page did not return valid JSON.") from e
    except (KeyError, ScrapeError) as e:
        raise ScrapeError(f"Failed to grab metadata for {url}.") from e
def get_json_sync(self, url, params=None, headers=None):
    """Make a synchronous GET request, usually called by the async get_json.

    Merges ``self.get_params`` and the module HEADERS into the
    caller-supplied params/headers. Raises ScrapeError on a non-200
    status or when the response body is not JSON.
    """
    params = {**(params or {}), **self.get_params}
    headers = {**(headers or {}), **HEADERS}
    result = requests.get(self.url + url, params=params, headers=headers, timeout=7)
    if result.status_code != 200:
        # Attach the JSON error body when there is one, but never let a
        # non-JSON error page mask the real status-code failure (the old
        # code raised "Did not receive JSON" instead of the status).
        try:
            body = result.json()
        except json.decoder.JSONDecodeError:
            body = None
        raise ScrapeError(f"Status code {result.status_code}.", body)
    try:
        return result.json()
    except json.decoder.JSONDecodeError as e:
        raise ScrapeError("Did not receive JSON from API.") from e
def parse_release_date(self, soup):
    """Return the page's release date normalized to YYYY-MM-DD."""
    try:
        published = soup.select('span[itemprop="datePublished"]')[0]
        raw_date = published.string.strip()
    except IndexError as e:
        raise ScrapeError("Could not parse release date.") from e
    return datetime.strptime(raw_date, "%d %B, %Y").strftime("%Y-%m-%d")
def parse_tracks(self, soup):
    """Parse the tracklist from an iTunes-style release page.

    Returns {disc_number(str): {track_number(str): track}}.
    Raises ScrapeError when a track row cannot be parsed.
    """
    tracks = defaultdict(dict)
    cur_disc = 1
    for track in soup.select(".web-preview"):
        try:
            try:
                num = (track.select(".song-index")[0].select(
                    ".column-data")[0].string.strip())
            except IndexError:
                # Rows without a track number (e.g. non-song rows).
                continue
            raw_title = track.select(".song-name")[0].text.strip()
            title = RE_FEAT.sub("", raw_title)
            explicit = bool(track.select(".badge.explicit.default"))
            # iTunes silently increments the disc number: a second
            # "track 1" on the current disc means a new disc started.
            if int(num) == 1 and num in tracks[str(cur_disc)]:
                cur_disc += 1
            tracks[str(cur_disc)][num] = self.generate_track(
                trackno=int(num),
                discno=cur_disc,
                artists=parse_artists(soup, track, raw_title),
                title=title,
                explicit=explicit,
            )
        except (ValueError, IndexError) as e:
            # Fixed: was `raise e` followed by an unreachable
            # `raise ScrapeError(...) from e` (debug leftover).
            raise ScrapeError("Could not parse tracks.") from e
    return dict(tracks)
async def create_soup(self, url, params=None):
    """Fetch release JSON from the Discogs API for the given URL."""
    release_id = self.regex.match(url)[1]
    try:
        return await self.get_json(f"/releases/{release_id}", params=params)
    except json.decoder.JSONDecodeError as e:
        raise ScrapeError("Discogs page did not return valid JSON.") from e
async def search_releases(self, searchstr, limit):
    """Search Beatport and return ("Beatport", {release_id: result})."""
    releases = {}
    soup = await self.create_soup(self.search_url, params={"q": searchstr})
    for meta in soup.select(".bucket-items.ec-bucket li .release-meta"):
        try:
            rls_id = int(
                re.search(r"/release/.+?/(\d+)$", meta.find("a")["href"])[1])
            artist_names = [
                a.string for a in meta.select(".release-artists a") if a.string
            ]
            title = next(
                t.string for t in meta.select(".release-title a") if t.string)
            if len(artist_names) < 4:
                artists = ", ".join(artist_names)
            else:
                artists = config.VARIOUS_ARTIST_WORD
            label = meta.select(".release-label a")[0].string
            if label.lower() not in config.SEARCH_EXCLUDED_LABELS:
                releases[rls_id] = (
                    IdentData(artists, title, None, None, "WEB"),
                    self.format_result(artists, title, label),
                )
        except (TypeError, IndexError) as e:
            raise ScrapeError("Failed to parse scraped search results.") from e
        if len(releases) == limit:
            break
    return "Beatport", releases
def parse_release_catno(self, soup):
    """Return the catalog number with internal spaces removed."""
    try:
        info_block = soup.find_all('div', attrs={'class': 'mb-3'})[1]
    except IndexError as e:
        raise ScrapeError("Could not parse catalog number.") from e
    raw = info_block.find('strong', text='Cat:').next_sibling
    return raw.strip().replace(" ", "")
def parse_cover_url(self, soup):
    """Return the last (highest-resolution) artwork URL from the srcset."""
    try:
        lockup = soup.select(".product-lockup__artwork-for-product")[0]
        candidates = lockup.img['srcset'].split(",")
    except (TypeError, IndexError) as e:
        raise ScrapeError("Could not parse cover URL.") from e
    # Each srcset entry is "<url> <descriptor>"; keep just the URL.
    return candidates[-1].split()[0]
def parse_release_label(self, soup):
    """Return the label name, or None for self-released albums
    (artist name identical to the label name)."""
    try:
        artist_el = soup.select('#name-section span[itemprop="byArtist"] a')[0]
        label_el = soup.select("#band-name-location .title")[0]
    except IndexError as e:
        raise ScrapeError("Could not parse record label.") from e
    if artist_el.string == label_el.string:
        return None
    return label_el.string
def parse_genres(self, soup):
    """Return the genre set, always including "Electronic".

    Strips trailing punctuation from the itemprop genre and splits it
    on "/", e.g. "House/Techno" -> {"Electronic", "House", "Techno"}.
    """
    try:
        genre_str = re.sub(
            r"[^A-Za-z]+$", "", soup.select('meta[itemprop="genre"]')[0]["content"]
        )
        return {"Electronic", *(set(genre_str.split("/")))}
    except (TypeError, IndexError) as e:
        # IndexError added: a missing genre tag previously crashed
        # instead of raising ScrapeError like the sibling parsers.
        raise ScrapeError("Could not parse genres.") from e
def parse_release_date(self, soup):
    """Return the Bandcamp release date formatted as YYYY-MM-DD."""
    try:
        credits_text = soup.select(".tralbumData.tralbum-credits")[0].text
        raw_date = re.search(
            r"release(?:d|s) ([^\d]+ \d+, \d{4})", credits_text)[1]
    except (TypeError, IndexError) as e:
        raise ScrapeError("Could not parse release date.") from e
    return datetime.strptime(raw_date, "%B %d, %Y").strftime("%Y-%m-%d")
def parse_cover_url(self, soup):
    """Return the full-size cover URL.

    Upgrades the scraped thumbnail URL by swapping the *last* "MED"
    marker for "BIG" and the size path "/300/" for "/full/".
    """
    try:
        thumb = soup.select(".img-release img")[0]["src"]
    except (TypeError, IndexError) as e:
        raise ScrapeError("Could not parse cover URL.") from e
    # rsplit/join replaces only the final occurrence of "MED".
    return "BIG".join(thumb.rsplit("MED", 1)).replace("/300/", "/full/")
def parse_cover_url(self, soup):
    """Return the 3x-density cover URL from the artwork srcset."""
    try:
        sources = soup.select(
            "picture.product-artwork.product-artwork--captioned"
            ".we-artwork--fullwidth.we-artwork.ember-view source"
        )
        srcset = sources[0]["srcset"]
        return re.search(r",(https://[^,]+\.jpg) 3x", srcset)[1]
    except (TypeError, IndexError) as e:
        raise ScrapeError("Could not parse cover URL.") from e
async def search_releases(self, searchstr, limit):
    """Search Junodownload and return ("Junodownload", {release_id: result})."""
    releases = {}
    soup = await self.create_soup(
        self.search_url,
        params={
            "submit-search": "SEARCH",
            "solrorder": "relevancy",
            "q[all][]": [searchstr],
        },
        allow_redirects=False,
    )
    result_rows = soup.select(
        "#page_nav + .product-list .productlist_widget_product_detail")
    for row in result_rows:
        try:
            info = row.select("div.productlist_widget_product_info")[0]
            if info["ua_location"] != "release header":
                # Skip sample packs and other non-release results.
                continue
            title_link = row.select(
                ".productlist_widget_product_title .jq_highlight.pwrtext a"
            )[0]
            rls_id = re.search(
                r"/products/[^/]+/([\d-]+)", title_link["href"])[1]
            title = title_link.string
            date = row.select(
                ".productlist_widget_product_preview_buy span")[0].string
            year = 2000 + int(date[-2:])
            artist_names = [
                a.string.title()
                for a in row.select(
                    ".productlist_widget_product_artists .jq_highlight.pwrtext a"
                )
                if a.string
            ]
            if artist_names and len(artist_names) < 5:
                artists = ", ".join(artist_names)
            else:
                artists = config.VARIOUS_ARTIST_WORD
            label = row.select(
                ".productlist_widget_product_label .jq_highlight.pwrtext a"
            )[0].string.strip()
            catno = (row.select(".productlist_widget_product_preview_buy")[0]
                     .text.split("\n")[1].strip())
            if label.lower() not in config.SEARCH_EXCLUDED_LABELS:
                releases[rls_id] = (
                    IdentData(artists, title, year, None, "WEB"),
                    self.format_result(
                        artists, title, f"{year} {label} {catno}"),
                )
        except (TypeError, IndexError) as e:
            raise ScrapeError("Failed to parse scraped search results.") from e
        if len(releases) == limit:
            break
    return "Junodownload", releases
def parse_release_label(self, soup):
    """Return the record label parsed from the copyright line."""
    copyright_els = soup.select(
        ".product-hero__tracks .link-list__item--copyright")
    try:
        return parse_copyright(copyright_els[0].string)
    except IndexError as e:
        raise ScrapeError("Could not parse record label.") from e
async def create_soup(self, url, params=None):
    """Run a GET request to Tidal's JSON API for album data.

    Tries each configured Tidal region in turn until one returns the
    album; raises ScrapeError when every region fails or the API does
    not return valid JSON.
    """
    params = params or {}
    album_id = self.parse_release_id(url)
    for cc in get_tidal_regions_to_fetch():
        try:
            self.country_code = cc
            params["countrycode"] = cc
            data = await self.get_json(f"/albums/{album_id}", params=params)
            tracklist = await self.get_json(
                f"/albums/{album_id}/tracks", params=params
            )
            data["tracklist"] = tracklist["items"]
            return data
        except json.decoder.JSONDecodeError as e:
            # Invalid JSON is fatal; no other region will do better.
            # Plain string (was a stray f-string with no placeholders).
            raise ScrapeError("Tidal page did not return valid JSON.") from e
        except (KeyError, ScrapeError):
            # This region doesn't have the album; try the next one.
            pass
    raise ScrapeError(f"Failed to grab metadata for {url}.")
def parse_genres(self, soup):
    """Collect whitelisted genres from the Bandcamp tag links."""
    found = set()
    try:
        for link in soup.select(".tralbumData.tralbum-tags a"):
            try:
                found |= fetch_genre(link.string)
            except GenreNotInWhitelist:
                # Tags outside the whitelist are simply skipped.
                continue
    except TypeError as e:
        raise ScrapeError("Could not parse genres.") from e
    return found
async def search_releases(self, searchstr, limit):
    """Search Junodownload and return ("Junodownload", {release_id: result})."""
    releases = {}
    soup = await self.create_soup(
        self.search_url,
        params={
            "submit-search": "SEARCH",
            "solrorder": "relevancy",
            "q[all][]": [searchstr],
        },
        allow_redirects=False,
    )
    listing_items = soup.find_all('div', attrs={
        'class': 'row gutters-sm jd-listing-item',
        'data-ua_location': 'release'
    })
    for item in listing_items:
        try:
            title_link = item.find('a', attrs={'class': 'juno-title'})
            rls_id = re.search(
                r"/products/[^/]+/([\d-]+)", title_link["href"])[1]
            title = title_link.string
            details = item.find('div', attrs={'class': 'text-sm mb-3 mb-lg-4'})
            # The <br> separates catno (before) from the date (after).
            line_break = details.find('br')
            date = line_break.next_sibling.strip()
            year = 2000 + int(date[-2:])
            catno = line_break.previous_sibling.strip().replace(' ', '')
            artist_blob = item.find('div', attrs={'class': 'col juno-artist'})
            artist_names = [
                a.string.title() for a in artist_blob.find_all('a') if a.string
            ]
            if artist_names and len(artist_names) < 5:
                artists = ", ".join(artist_names)
            else:
                artists = config.VARIOUS_ARTIST_WORD
            label = item.find('a', attrs={'class': 'juno-label'}).text.strip()
            if label.lower() not in config.SEARCH_EXCLUDED_LABELS:
                releases[rls_id] = (
                    IdentData(artists, title, year, None, "WEB"),
                    self.format_result(
                        artists, title, f"{year} {label} {catno}"),
                )
        except (TypeError, IndexError) as e:
            raise ScrapeError("Failed to parse scraped search results.") from e
        if len(releases) == limit:
            break
    return "Junodownload", releases
def get_tidal_regions_to_fetch():
    """Return the list of Tidal region codes to try when scraping.

    Prefers the explicit TIDAL_FETCH_REGIONS config; otherwise falls
    back to regions with enabled Tidal downloader accounts. Raises
    ScrapeError when neither source yields any region.
    """
    if config.TIDAL_FETCH_REGIONS:
        return config.TIDAL_FETCH_REGIONS
    try:
        # Optional plugin; ignored when the downloader isn't installed.
        from plugins.downloader.accounts import ACCOUNTS
        if "Tidal" in ACCOUNTS:
            return [k for k, v in ACCOUNTS["Tidal"].items() if v]
    except ImportError:
        pass
    # Plain string (was a stray f-string with no placeholders).
    raise ScrapeError("No regions defined for Tidal to grab from")
def parse_genres(self, soup):
    """Return the genre set for the release, applying genre aliases."""
    try:
        genre_links = soup.select(
            ".product-header__list .inline-list "
            "li.inline-list__item.inline-list__item--bulleted a"
        )
        genre = genre_links[0].string
        try:
            # Aliased genres expand to their configured tag set.
            return ALIAS_GENRE[genre]
        except KeyError:
            return {genre.strip()}
    except (TypeError, IndexError) as e:
        raise ScrapeError("Could not parse genres.") from e
async def run_metadata(url, sources=None, return_source_name=False):
    """Run a scrape for the metadata of a URL"""
    if not sources:
        chosen = METASOURCES
    else:
        chosen = {
            name: source
            for name, source in METASOURCES.items()
            if name in sources
        }
    for name, source in chosen.items():
        if not source.Scraper.regex.match(url):
            continue
        click.secho(f"Getting metadata from {name}.", fg="cyan")
        scraped = await source.Scraper().scrape_release(url)
        return (scraped, name) if return_source_name else scraped
    raise ScrapeError("URL did not match a scraper.")
async def get_tracks(self, url, params=None):
    """Fetch and parse Deezer track data embedded in an album HTML page.

    Scrapes the site page (not the JSON API) and extracts the
    serialized ``window.__DZR_APP_STATE__`` JavaScript object, then
    massages it into valid JSON before loading it.

    Raises ScrapeError when the app-state blob cannot be found.
    """
    # Blocking requests call pushed onto the executor to stay async.
    track_data = await loop.run_in_executor(
        None,
        lambda: self.sesh.get(self.site_url + url, params=(params or {})))
    # Newlines stripped first so the non-greedy match spans the blob.
    r = re.search(
        r"window.__DZR_APP_STATE__ = ({.*?}})</script>",
        track_data.text.replace("\n", ""),
    )
    if not r:
        raise ScrapeError("Failed to scrape track data.")
    # The blob is a JS object literal, not JSON. First re-quote
    # single-quoted `type: '...'` values with double quotes...
    raw = re.sub(r"{(\s*)type\: +\'([^\']+)\'", r'{\1type: "\2"', r[1])
    # ...then double-quote the remaining bare, tab-indented keys.
    raw = re.sub("\t+([^:]+): ", r'"\1":', raw)
    return json.loads(raw)["SONGS"]["data"]
def parse_genres(self, soup):
    """Return the genre set from the embedded schema.org album JSON.

    Each scraped genre is either expanded through ALIAS_GENRE (an
    alias can map to several tags) or returned untouched.
    """
    try:
        info = json.loads(
            soup.find(attrs={
                "name": "schema:music-album"
            }).text)
        return {
            g
            for gs in info['genre']
            for g in ALIAS_GENRE.get(gs, [gs])
        }
    except (TypeError, IndexError, KeyError, AttributeError) as e:
        # AttributeError: schema tag missing (find() returned None);
        # KeyError: no 'genre' field. Both previously crashed instead
        # of raising ScrapeError like the sibling parsers.
        raise ScrapeError("Could not parse genres.") from e
def parse_tracks(self, soup):
    """Parse the tracklist table into {disc: {trackno: track}}."""
    tracks = defaultdict(dict)
    cur_disc = 1
    for row in soup.select("#product_tracklist tbody tr .col-title"):
        try:
            # Rows look like "<number>. <title>".
            num, title = [part.strip() for part in row.text.split(".", 1)]
            tracks[str(cur_disc)][num] = self.generate_track(
                trackno=num,
                discno=cur_disc,
                artists=parse_artists(soup, row, title),
                title=parse_title(title, row),
            )
        except (ValueError, IndexError) as e:
            raise ScrapeError("Could not parse tracks.") from e
    return dict(tracks)
async def search_releases(self, searchstr, limit):
    """Search MusicBrainz and return ("MusicBrainz", {release_id: result})."""
    releases = {}
    soup = await loop.run_in_executor(None, musicbrainzngs.search_releases,
                                      searchstr, 10)
    for rls in soup["release-list"]:
        try:
            artists = rls["artist-credit-phrase"]
            try:
                track_count = rls["medium-track-count"]
            except KeyError:
                track_count = None
            label = catno = ""
            if ("label-info-list" in rls and rls["label-info-list"]
                    and "label" in rls["label-info-list"][0]
                    and "name" in rls["label-info-list"][0]["label"]):
                label = rls["label-info-list"][0]["label"]["name"]
                # musicbrainzngs uses hyphenated keys (cf. "label-info-list",
                # "medium-track-count"); the old "catalog_number" key never
                # matched, so the catalog number was always empty.
                if "catalog-number" in rls["label-info-list"][0]:
                    catno = rls["label-info-list"][0]["catalog-number"]
            try:
                source = rls["medium-list"][0]["format"]
            except KeyError:
                source = None
            edition = ""
            if label:
                edition += label
            if catno:
                edition += " " + catno
            if label.lower() not in config.SEARCH_EXCLUDED_LABELS:
                releases[rls["id"]] = (
                    IdentData(artists, rls["title"], None, track_count, source),
                    self.format_result(
                        artists,
                        rls["title"],
                        edition,
                        ed_title=source,
                        track_count=track_count,
                    ),
                )
        except (TypeError, IndexError) as e:
            raise ScrapeError("Failed to parse scraped search results.") from e
        if len(releases) == limit:
            break
    return "MusicBrainz", releases
def parse_tracks(self, soup):
    """Build {disc_position: {track_number: track}} from a MB release."""
    tracks = defaultdict(dict)
    for medium in soup["medium-list"]:
        disc_key = str(medium["position"])
        for entry in medium["track-list"]:
            try:
                recording = entry["recording"]
                tracks[disc_key][str(entry["number"])] = self.generate_track(
                    trackno=entry["number"],
                    discno=medium["position"],
                    artists=parse_artists(recording["artist-credit"]),
                    title=recording["title"],
                )
            except (ValueError, IndexError) as e:
                raise ScrapeError("Could not parse tracks.") from e
    return dict(tracks)
async def create_soup(self, url, params=None, headers=None, **kwargs):
    """
    Asynchronously run a webpage scrape and return a BeautifulSoup
    object containing the scraped HTML.

    Caller-supplied headers are merged over the module HEADERS; they
    were previously accepted but silently ignored.
    """
    params = params or {}
    merged_headers = {**HEADERS, **(headers or {})}
    r = await loop.run_in_executor(
        None,
        lambda: requests.get(
            url, params=params, headers=merged_headers, timeout=7, **kwargs),
    )
    if r.status_code != 200:
        raise ScrapeError(
            f"Failed to successfully scrape page. Status code: {r.status_code}"
        )
    return BeautifulSoup(r.text, "html.parser")
def parse_tracks(self, soup):
    """Parse the Beatport release tracklist into {disc: {trackno: track}}."""
    tracks = defaultdict(dict)
    cur_disc = 1
    rows = soup.select(".bucket.tracks.interior-release-tracks "
                       ".bucket-item.ec-item.track")
    for row in rows:
        try:
            number = row.select(".buk-track-num")[0].string
            tracks[str(cur_disc)][number] = self.generate_track(
                trackno=number,
                discno=cur_disc,
                artists=parse_artists(row),
                title=parse_title(row),
            )
        except (ValueError, IndexError) as e:
            raise ScrapeError("Could not parse tracks.") from e
    return dict(tracks)
def parse_tracks(self, soup):
    """Parse the Junodownload tracklist into {disc: {trackno: track}}."""
    tracks = defaultdict(dict)
    cur_disc = 1
    rows = soup.find_all(
        'div',
        attrs={
            'class': 'row gutters-sm align-items-center product-tracklist-track'
        },
    )
    for row in rows:
        try:
            # Rows look like "<number>. ..."; the title sits in its own div.
            number = row.text.strip().split(".", 1)[0]
            title_div = row.find('div', attrs={'class': 'col track-title'})
            track_title = title_div.find('a').text
            tracks[str(cur_disc)][number] = self.generate_track(
                trackno=number,
                discno=cur_disc,
                artists=parse_artists(soup, row, track_title),
                title=parse_title(track_title, row),
            )
        except (ValueError, IndexError) as e:
            raise ScrapeError("Could not parse tracks.") from e
    return dict(tracks)
def parse_tracks(self, soup):
    """Parse the Apple Music tracklist into {disc: {trackno: track}}.

    Skips music-video rows and rows without a track number; raises
    ScrapeError when a track row cannot be parsed.
    """
    tracks = defaultdict(dict)
    cur_disc = 1
    for track in soup.select(".product-hero__tracks tr.table__row"):
        try:
            if track.select(".icon-musicvideo"):
                # Music videos are not audio tracks.
                continue
            try:
                num = track.select(
                    ".table__row__track span.table__row__number"
                )[0].string.strip()
            except IndexError:
                # Row without a track number.
                continue
            raw_title = track.select(
                ".table__row__name .table__row__titles .table__row__headline"
            )[0].text.strip()
            title = RE_FEAT.sub("", raw_title)
            explicit = bool(
                track.select(
                    ".table__row__name .table__row__titles .icon-explicit")
            )
            # iTunes silently increments the disc: a second "track 1" on the
            # current disc means a new disc started. Track keys are strings,
            # so compare with `num` — the old `int(num) in ...` could never
            # match and the disc number was never incremented.
            if int(num) == 1 and num in tracks[str(cur_disc)]:
                cur_disc += 1
            tracks[str(cur_disc)][num] = self.generate_track(
                trackno=int(num),
                discno=cur_disc,
                artists=parse_artists(soup, track, raw_title),
                title=title,
                explicit=explicit,
            )
        except (ValueError, IndexError) as e:
            # Fixed: was `raise e` followed by an unreachable
            # `raise ScrapeError(...) from e` (debug leftover).
            raise ScrapeError("Could not parse tracks.") from e
    return dict(tracks)