Example #1
def scrape() -> None:
    """Scrape the subcategory links from www.billboard.com. Returns None."""
    print("--- SCRAPE SUBCATEGORIES, STARTED ---")
    todo, finished = scrape_setup(YEAR_FIN, SUB_CATEGORY_FIN)
    print("finished:", len(finished))
    print("todo    :", len(todo))

    errors = load_file_list(SUB_CATEGORY_ERR)
    allurls = []
    for link in todo:
        try:
            soup = get_soup(link, filter_=A_TAGS)
            year = get_year(link)
            pattern = format_regex_string(year)
            a_tags = get_links(soup, pattern)
            hrefs = get_hrefs(a_tags)
            allurls += [HOME_PAGE + href for href in hrefs]
            print("Done:", link)
        except Exception:  # a bare except here would also swallow KeyboardInterrupt
            print("Error:", link)
            errors.append(link)
            save_append_line(link, SUB_CATEGORY_ERR)
    save(allurls, SUB_CATEGORY_FIN)
    save(list(set(errors)), SUB_CATEGORY_ERR)
    print("--- SCRAPE SUBCATEGORIES, FINISHED ---")
    return None
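The helpers these examples call (get_soup, get_links, get_hrefs) are defined elsewhere in the module and are not shown on this page. A minimal sketch of plausible implementations, assuming requests and BeautifulSoup; every name and signature here is inferred from the call sites above, not taken from the original source:

import re
import requests
from bs4 import BeautifulSoup, SoupStrainer

A_TAGS = SoupStrainer("a")  # assumed: restrict parsing to <a> tags for speed

def get_soup(url, filter_=None):
    """Fetch url and parse it, optionally restricted by a SoupStrainer."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser", parse_only=filter_)

def get_links(soup, pattern):
    """Return all <a> tags whose href matches the regex pattern."""
    return soup.find_all("a", href=re.compile(pattern))

def get_hrefs(a_tags):
    """Pull the href attribute out of each tag."""
    return [tag["href"] for tag in a_tags]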
Example #2
def scrape() -> None:
    """Scrape the issue-date links from www.billboard.com. Returns None."""
    print("--- ISSUE DATE SCRAPING, STARTED ---")
    todo, finished = scrape_setup(SUB_CATEGORY_FIN, ISSUE_FIN)
    print("finished:", len(finished))
    print("todo    :", len(todo))

    errors = load_file_list(ISSUE_ERR)
    for link in todo:
        try:
            soup = get_soup(link, filter_=A_TAGS)

            # Pull the chart type and subcategory out of the URL path.
            charts = Path(link).parts[-3]
            subcat = Path(link).parts[-1]

            pattern = format_search_string(charts, subcat)
            issuedates = get_links(soup, pattern)
            hrefs = get_hrefs(issuedates)
            links = sorted(HOME_PAGE + href for href in hrefs)
            save_append(links, ISSUE_FIN)
            print("Saved :: ", link)
        except AttributeError:
            errors.append(link)
        except KeyboardInterrupt:
            print("Stopped manually.")
            break  # otherwise Ctrl-C only skips the current link
    save(list(set(errors)), ISSUE_ERR)
    print("--- ISSUE DATE SCRAPING, FINISHED ---")
    return None
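scrape_setup and the save/load helpers are likewise assumed rather than shown. One plausible reading, consistent with how Example #5 marks each processed link as finished; the resume-from-file logic is an assumption:

from pathlib import Path

def load_file_list(path):
    """Return the file's lines as a list; a missing file yields an empty list."""
    try:
        return Path(path).read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        return []

def save(lines, path):
    """Overwrite path, one entry per line."""
    Path(path).write_text("\n".join(lines) + "\n", encoding="utf-8")

def save_append(lines, path):
    """Append entries to path, one per line."""
    with open(path, "a", encoding="utf-8") as handle:
        handle.writelines(line + "\n" for line in lines)

def save_append_line(line, path):
    save_append([line], path)

def scrape_setup(source_file, finished_file):
    """Split the source URLs into those still to do and those already done."""
    links = load_file_list(source_file)
    finished = load_file_list(finished_file)
    todo = sorted(set(links) - set(finished))
    return todo, finished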
Example #3
def scrape() -> None:
    """Scrape the year links from www.billboard.com. Returns None."""
    print("--- SCRAPING YEARS, STARTED ---")
    soup = get_soup(YEAR_PAGE, filter_=A_TAGS)
    a_tags = get_links(soup, r"/archive/charts/[0-9]+")
    hrefs = get_hrefs(a_tags)
    links = [HOME_PAGE + href for href in hrefs]
    save(links, YEAR_FIN)
    print("--- SCRAPING YEARS, FINISHED ---")
    return None
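Example #3 passes get_links a literal regex, while Examples #1 and #2 build theirs with format_regex_string and format_search_string. Neither builder (nor get_year) appears in the source; the sketches below are guesses from the URL shapes the examples imply, and the exact patterns are assumptions:

def get_year(link):
    # Assumed: year URLs end in the year itself, e.g. .../archive/charts/2015
    return link.rstrip("/").rsplit("/", 1)[-1]

def format_regex_string(year):
    # Assumed: subcategory links on a year page look like /archive/charts/<year>/<slug>
    return rf"/archive/charts/{year}/"

def format_search_string(charts, subcat):
    # Assumed: issue-date links look like /<charts>/<subcat>/<YYYY-MM-DD>
    return rf"/{charts}/{subcat}/\d{{4}}-\d{{2}}-\d{{2}}"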
Example #4
def scrape() -> None:
    """Scrape the artist category links from the home page. Returns None."""
    print("--- CATEGORY SCRAPING STARTED ---")
    print("Scraping from:", HOME_PAGE)
    soup = get_soup(HOME_PAGE)
    category_links = get_links(soup, "^/artists/")
    a_tags = set(category_links)  # drop duplicate tags
    hrefs = get_hrefs(a_tags)
    # "/99999" looks like a page-number suffix chosen high enough to land on
    # the last page; the exact meaning is site-specific.
    suffixed = [href + "/99999" for href in hrefs]
    prefixed = [HOME_PAGE + href for href in suffixed]
    save(prefixed, CATEGORY_FIN)
    print("--- CATEGORY SCRAPING FINISHED ---")
Example #5
def scrape() -> None:
    """Scrape the song (lyric) links from each artist page. Returns None."""
    print("--- SONG SCRAPING, START ---")
    todo, finished = scrape_setup_song(ARTIST_DIR, SONG_FIN)
    print("Finished:", len(finished))
    print("To do   :", len(todo))

    errors = load_file_list(SONG_ERRORS)
    for link in sorted(todo):
        try:
            soup = get_soup(link)
            a_tags = get_links(soup, "^/lyric/")
            hrefs = get_hrefs(a_tags)
            # unquote (urllib.parse) decodes percent-escapes in the song URLs
            links = [unquote(HOME_PAGE + href) for href in hrefs]
            save_append(links, LYRIC_TODO)
            save_append_line(link, SONG_FIN)  # mark this artist page as done
        except Exception:  # a bare except here would also swallow KeyboardInterrupt
            errors.append(link)

    save(list(set(errors)), SONG_ERRORS)
    print("--- SONG SCRAPING, FINISHED ---")