def scrape() -> None:
    """Scrape the subcategory links for each year from www.billboard.com.

    Splits the year links into to-do/finished via scrape_setup, collects
    every subcategory URL found on each to-do page into SUB_CATEGORY_FIN,
    and records failing links (deduplicated) in SUB_CATEGORY_ERR.

    Returns:
        None.
    """
    print("--- SCRAPE SUBCATEGORIES, STARTED ---")
    todo, finished = scrape_setup(YEAR_FIN, SUB_CATEGORY_FIN)
    print("finished:", len(finished))
    print("todo :", len(todo))
    errors = load_file_list(SUB_CATEGORY_ERR)
    allurls = []
    for link in todo:
        try:
            soup = get_soup(link, filter_=A_TAGS)
            year = get_year(link)
            pattern = format_regex_string(year)
            a_tags = get_links(soup, pattern)
            hrefs = get_hrefs(a_tags)
            allurls += [HOME_PAGE + href for href in hrefs]
            print("Done:", link)
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit and make the scraper unstoppable.
        except Exception:
            print("Error:", link)
            errors.append(link)
            save_append_line(link, SUB_CATEGORY_ERR)
    save(allurls, SUB_CATEGORY_FIN)
    save(list(set(errors)), SUB_CATEGORY_ERR)
    print("--- SCRAPE SUBCATEGORIES, FINISHED ---")
    return None
def scrape() -> None:
    """Scrape links from issue-date pages from www.billboard.com.

    For each pending subcategory link, extracts the chart name and
    subcategory from the URL path, collects the matching issue-date links,
    and appends them to ISSUE_FIN. Failed links are recorded
    (deduplicated) in ISSUE_ERR.

    Returns:
        None.
    """
    print("--- ISSUE DATE SCRAPING, STARTED --- ")
    todo, finished = scrape_setup(SUB_CATEGORY_FIN, ISSUE_FIN)
    print("finished:", len(finished))
    print("todo :", len(todo))
    errors = load_file_list(ISSUE_ERR)
    for link in todo:
        try:
            soup = get_soup(link, filter_=A_TAGS)
            # Rearrange url suffix: pull the chart name and subcategory
            # out of the URL path components.
            parts = Path(link).parts
            charts = parts[-3]
            subcat = parts[-1]
            pattern = format_search_string(charts, subcat)
            issuedates = get_links(soup, pattern)
            hrefs = get_hrefs(issuedates)
            links = sorted(HOME_PAGE + href for href in hrefs)
            save_append(links, ISSUE_FIN)
            print("Saved :: ", link)
        except AttributeError:
            errors.append(link)
        except KeyboardInterrupt:
            # Actually stop on Ctrl-C instead of continuing with the
            # next link.
            print("Stopped manually.")
            break
    # BUG FIX: the original called save(errors, list(set(ISSUE_ERR))) —
    # deduplicating the filename constant instead of the error list — and
    # only did so when interrupted, so errors from a normal run were lost.
    save(list(set(errors)), ISSUE_ERR)
    print("--- ISSUE DATE SCRAPING, FINISHED --- ")
    return None
def scrape() -> None:
    """Scrape the year-links from www.billboard.com.

    Fetches YEAR_PAGE, extracts the archive chart links for each year,
    prefixes them with HOME_PAGE and saves them to YEAR_FIN.

    Returns:
        None.
    """
    print("--- SCRAPING YEARS, STARTED ---")
    soup = get_soup(YEAR_PAGE, filter_=A_TAGS)
    # Raw string: "\/" is an invalid escape in a plain string literal
    # (SyntaxWarning since Python 3.12), and "/" needs no escaping in a
    # regex anyway — the pattern matches the same URLs.
    links = get_links(soup, r"/archive/charts/[0-9]*")
    links = get_hrefs(links)
    links = [HOME_PAGE + suffix for suffix in links]
    save(links, YEAR_FIN)
    print("--- SCRAPING YEARS, FINISHED ---")
    return None
def scrape() -> None:
    """Collect artist-category links from HOME_PAGE and save them.

    Each unique category href is given the "/99999" suffix and the
    HOME_PAGE prefix before being written to CATEGORY_FIN.
    """
    print("--- CATEGORY SCRAPING STARTED ---")
    print("Scraping from:", HOME_PAGE)
    soup = get_soup(HOME_PAGE)
    # Deduplicate the <a> tags before extracting their hrefs.
    unique_tags = set(get_links(soup, "^/artists/"))
    hrefs = get_hrefs(unique_tags)
    full_links = [HOME_PAGE + href + "/99999" for href in hrefs]
    save(full_links, CATEGORY_FIN)
    print("--- CATEGORY SCRAPING FINISHED ---")
def scrape() -> None:
    """Main song-scraping function.

    For every artist page still to do, collects its "/lyric/" links
    (URL-unquoted, prefixed with HOME_PAGE) into LYRIC_TODO and marks the
    page as finished in SONG_FIN. Pages that fail are recorded
    (deduplicated) in SONG_ERRORS.

    Returns:
        None.
    """
    print("--- SONG SCRAPING, START ---")
    todo, finished = scrape_setup_song(ARTIST_DIR, SONG_FIN)
    print("Finished:", len(finished))
    print("To do :", len(todo))
    errors = load_file_list(SONG_ERRORS)
    for page in sorted(todo):
        try:
            soup = get_soup(page)
            a_tags = get_links(soup, "^/lyric/")
            hrefs = get_hrefs(a_tags)
            links = [unquote(HOME_PAGE + href) for href in hrefs]
            save_append(links, LYRIC_TODO)
            # Only mark the page finished once its links are persisted.
            save_append_line(page, SONG_FIN)
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit and make the scraper unstoppable.
        except Exception:
            errors.append(page)
    save(list(set(errors)), SONG_ERRORS)
    print("--- SONG SCRAPING, FINISHED ---")