def scrape() -> None:
    """Scrape the subcategory links for each year from www.billboard.com.

    Splits the year links into to-do/finished via scrape_setup, collects
    every subcategory URL found on each to-do page into SUB_CATEGORY_FIN,
    and records failing links (deduplicated) in SUB_CATEGORY_ERR.

    Returns:
        None.
    """
    print("--- SCRAPE SUBCATEGORIES, STARTED ---")
    todo, finished = scrape_setup(YEAR_FIN, SUB_CATEGORY_FIN)
    print("finished:", len(finished))
    print("todo :", len(todo))
    errors = load_file_list(SUB_CATEGORY_ERR)
    allurls = []
    for link in todo:
        try:
            soup = get_soup(link, filter_=A_TAGS)
            year = get_year(link)
            pattern = format_regex_string(year)
            a_tags = get_links(soup, pattern)
            hrefs = get_hrefs(a_tags)
            allurls += [HOME_PAGE + href for href in hrefs]
            print("Done:", link)
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit and make the scraper unstoppable.
        except Exception:
            print("Error:", link)
            errors.append(link)
            save_append_line(link, SUB_CATEGORY_ERR)
    save(allurls, SUB_CATEGORY_FIN)
    save(list(set(errors)), SUB_CATEGORY_ERR)
    print("--- SCRAPE SUBCATEGORIES, FINISHED ---")
    return None
def scrape() -> None:
    """Scrape links from issue-date pages from www.billboard.com.

    For each pending subcategory link, extracts the chart name and
    subcategory from the URL path, collects the matching issue-date links,
    and appends them to ISSUE_FIN. Failed links are recorded
    (deduplicated) in ISSUE_ERR.

    Returns:
        None.
    """
    print("--- ISSUE DATE SCRAPING, STARTED --- ")
    todo, finished = scrape_setup(SUB_CATEGORY_FIN, ISSUE_FIN)
    print("finished:", len(finished))
    print("todo :", len(todo))
    errors = load_file_list(ISSUE_ERR)
    for link in todo:
        try:
            soup = get_soup(link, filter_=A_TAGS)
            # Rearrange url suffix: pull the chart name and subcategory
            # out of the URL path components.
            parts = Path(link).parts
            charts = parts[-3]
            subcat = parts[-1]
            pattern = format_search_string(charts, subcat)
            issuedates = get_links(soup, pattern)
            hrefs = get_hrefs(issuedates)
            links = sorted(HOME_PAGE + href for href in hrefs)
            save_append(links, ISSUE_FIN)
            print("Saved :: ", link)
        except AttributeError:
            errors.append(link)
        except KeyboardInterrupt:
            # Actually stop on Ctrl-C instead of continuing with the
            # next link.
            print("Stopped manually.")
            break
    # BUG FIX: the original called save(errors, list(set(ISSUE_ERR))) —
    # deduplicating the filename constant instead of the error list — and
    # only did so when interrupted, so errors from a normal run were lost.
    save(list(set(errors)), ISSUE_ERR)
    print("--- ISSUE DATE SCRAPING, FINISHED --- ")
    return None
def scrape() -> None:
    """Scrape the year-links from www.billboard.com.

    Fetches YEAR_PAGE, extracts the archive chart links for each year,
    prefixes them with HOME_PAGE and saves them to YEAR_FIN.

    Returns:
        None.
    """
    print("--- SCRAPING YEARS, STARTED ---")
    soup = get_soup(YEAR_PAGE, filter_=A_TAGS)
    # Raw string: "\/" is an invalid escape in a plain string literal
    # (SyntaxWarning since Python 3.12), and "/" needs no escaping in a
    # regex anyway — the pattern matches the same URLs.
    links = get_links(soup, r"/archive/charts/[0-9]*")
    links = get_hrefs(links)
    links = [HOME_PAGE + suffix for suffix in links]
    save(links, YEAR_FIN)
    print("--- SCRAPING YEARS, FINISHED ---")
    return None
def scrape() -> None:
    """Collect artist-category links from HOME_PAGE and save them.

    Each unique category href is given the "/99999" suffix and the
    HOME_PAGE prefix before being written to CATEGORY_FIN.
    """
    print("--- CATEGORY SCRAPING STARTED ---")
    print("Scraping from:", HOME_PAGE)
    soup = get_soup(HOME_PAGE)
    # Deduplicate the <a> tags before extracting their hrefs.
    unique_tags = set(get_links(soup, "^/artists/"))
    hrefs = get_hrefs(unique_tags)
    full_links = [HOME_PAGE + href + "/99999" for href in hrefs]
    save(full_links, CATEGORY_FIN)
    print("--- CATEGORY SCRAPING FINISHED ---")
def scrape() -> None:
    """Main song-scraping function.

    For every artist page still to do, collects its "/lyric/" links
    (URL-unquoted, prefixed with HOME_PAGE) into LYRIC_TODO and marks the
    page as finished in SONG_FIN. Pages that fail are recorded
    (deduplicated) in SONG_ERRORS.

    Returns:
        None.
    """
    print("--- SONG SCRAPING, START ---")
    todo, finished = scrape_setup_song(ARTIST_DIR, SONG_FIN)
    print("Finished:", len(finished))
    print("To do :", len(todo))
    errors = load_file_list(SONG_ERRORS)
    for page in sorted(todo):
        try:
            soup = get_soup(page)
            a_tags = get_links(soup, "^/lyric/")
            hrefs = get_hrefs(a_tags)
            links = [unquote(HOME_PAGE + href) for href in hrefs]
            save_append(links, LYRIC_TODO)
            # Only mark the page finished once its links are persisted.
            save_append_line(page, SONG_FIN)
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit and make the scraper unstoppable.
        except Exception:
            errors.append(page)
    save(list(set(errors)), SONG_ERRORS)
    print("--- SONG SCRAPING, FINISHED ---")