def scrape():
    """Scrape every blog article and return a list of Post objects.

    Follows pagination via get_articles(), then visits each article page
    to extract its date and title. Articles without a parsable date are
    skipped. Dates are de-duplicated by incrementing until unique.
    """
    soup = make_soup(WEBSITE)
    dates = []
    data = []
    # Collect the URL of every individual article across all listing pages.
    articles = get_articles([], soup)
    # Visit each article page and extract its metadata.
    for article in articles:
        blog_soup = make_soup(article)
        link = article
        date = get_date(blog_soup)
        title = blog_soup.find("h1", {"class": "article-title"}).text.strip()
        if date is None:
            continue
        # Keep dates unique: the original bumped the value only once, which
        # could still collide with another already-recorded date; loop until
        # the value is genuinely unused.
        while date in dates:
            date += 1
        dates.append(date)
        data.append(Post(None, date, title, link, SOURCE_CODE, None))
    return data
def scrape():
    """Walk every paginated news-listing page and collect all posts."""
    posts = []
    page_url = WEBSITE
    while page_url is not None:
        page = make_soup(page_url)
        listing = page.find("div", {"class": "c-latest-news"})
        for card in listing.find_all("div", {"class": "col-12 mb-5 col-lg-4"}):
            raw_date = card.find("span", {"class": "published-date"}).text.strip().replace(',', '')
            posts.append(Post(
                None,
                conform_date(raw_date),
                card.find("h3").text.strip(),
                card.find("a").get("href"),
                get_image(card),
                ALT_IMAGE,
                SOURCE_CODE,
                None,
            ))
            # Periodic progress report while crawling.
            if len(posts) % 50 == 0:
                print(now() + f"Processed {len(posts)} posts")
        # Follow the "next" pagination link; stop when there is none.
        next_link = page.find("a", {"class": "next"})
        page_url = next_link.get("href") if next_link is not None else None
    return posts
def get_image(link):
    """Return the first image URL inside a post's content, or ALT_IMAGE."""
    content = make_soup(link).find("div", {"class": "post-content"})
    img = content.find("img")
    # Fall back to the shared placeholder when the post has no image.
    return img.get("src") if img else ALT_IMAGE
def get_articles(articles, soup):
    """Append every article URL across all listing pages to *articles*.

    Follows the "pagination-next" link until it disappears; returns the
    (mutated) articles list.
    """
    while True:
        next_page = soup.find("li", {"class": "pagination-next"})
        listing = soup.find("ul", {"class": "article-list"})
        for entry in listing.find_all("li"):
            articles.append(BASE_SITE + entry.find("a").get("href"))
        # No further pages: done.
        if next_page is None:
            break
        soup = make_soup(BASE_SITE + next_page.find("a").get("href"))
    return articles
def scrape():
    """Scrape the single listing page and return its posts."""
    page = make_soup(WEBSITE)
    results = []
    for entry in page.find_all("div", {"class": "post-content"}):
        url = entry.find("a").get("href")
        # Strip dashes from the date text and append a zero time component.
        stamp = entry.find("p").text.strip().replace('-', '') + "0000"
        heading = entry.find("h4").text.strip()
        results.append(Post(None, stamp, heading, url, SOURCE_CODE, None))
    return results
def scrape():
    """Scrape the listing page, resolving each post's full-size image."""
    page = make_soup(WEBSITE)
    posts = []
    for entry in page.find_all("div", {"class": "post-content"}):
        # Strip dashes from the date text; "0000" pads a zero time component.
        stamp = entry.find("p").text.strip().replace('-', '')
        heading = entry.find("h4").text.strip()
        url = entry.find("a").get("href")
        # Thumbnail from the listing doubles as the fallback image.
        fallback = entry.find_all("img")[0].get("src")
        posts.append(Post(None, stamp + "0000", heading, url,
                          get_image(url), fallback, SOURCE_CODE, None))
        if len(posts) % 25 == 0:
            print(now() + f"Processed {len(posts)} posts")
    return posts
def scrape():
    """Scrape the blog-post list widget and return its posts."""
    page = make_soup(WEBSITE)
    results = []
    for item in page.find("ul", {"class": "blog_post_list_widget"}):
        # Normalize the timestamp to digits only and drop the trailing
        # two characters.
        raw = item.find("abbr").get("title")
        stamp = raw.replace("-", "").replace(" ", "").replace(":", "")[:-2]
        title_link = item.find("a", {"class": "title"})
        thumb = item.find("img", {"class": "post_image"})
        # URL-encode spaces in the image link; fall back when absent.
        if thumb:
            picture = thumb.get("src").replace(" ", "%20")
        else:
            picture = ALT_IMAGE
        results.append(Post(None, stamp, title_link.text.strip(),
                            title_link.get("href"), picture, ALT_IMAGE,
                            SOURCE_CODE, None))
        if len(results) % 25 == 0:
            print(now() + f"Processed {len(results)} posts")
    return results
def scrape():
    """Scrape the Windbound news cards and return them as Post objects."""
    soup = make_soup(WEBSITE)
    base_site = "https://windboundgame.com"
    data = []
    for post in soup.find_all("div", {"class": "card--news"}):
        date = get_date(post.find("p").text.strip())
        title = post.find("h3").text.strip()
        # Card links are relative; prefix the site root.
        link = base_site + post.find("a").get("href")
        alt_image = ALT_IMAGE
        # URL-encode spaces so the image link stays valid.
        # (Removed a leftover debug print of the image URL that spammed
        # stdout on every card.)
        image = post.find("img").get("src").replace(" ", "%20")
        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")
    return data
def scrape():
    """Scrape the articles in the site's second <section> into Post objects."""
    # One fallback image is fetched once and shared by every post.
    alt_image = get_alt_image()
    soup = make_soup(WEBSITE)
    data = []
    # (Removed the no-op `alt_image = alt_image` self-assignment the
    # original performed on every iteration; also replaced the deprecated
    # BS4 `findAll` alias with `find_all` — identical behavior.)
    for post in soup.find_all("section")[1].find_all("article"):
        # Strip dashes from the date text and append a zero time component.
        date = post.find("time").text.replace("-", "") + "0000"
        title = post.find("h3").text.strip()
        link = BASESITE + post.find("a").get("href")
        # Image src is relative; prefix the site root and escape spaces.
        image = BASESITE + post.find("picture").find("img").get("src").replace(" ", "%20")
        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")
    return data
def scrape():
    """Collect every post across all paginated archive pages."""
    collected = []
    page_url = WEBSITE
    while page_url is not None:
        page = make_soup(page_url)
        for article in page.find_all("article", {"class": "post"}):
            raw_date = article.find("span", {"class": "date"}).text.strip().replace('-', '')
            # The post image is embedded in an inline CSS background style;
            # get_image() extracts the URL from that style text.
            style_text = article.find("div", {"class": "background--cover"}).get("style")
            collected.append(Post(
                None,
                conform_date(raw_date),
                article.find("h3").text.strip(),
                article.find("a").get("href"),
                get_image(style_text),
                ALT_IMAGE,
                SOURCE_CODE,
                None,
            ))
            if len(collected) % 25 == 0:
                print(now() + f"Processed {len(collected)} posts")
        # Follow the "next" pagination link; stop when there is none.
        nxt = page.find("a", {"class": "next"})
        page_url = nxt.get("href") if nxt is not None else None
    return collected
def get_alt_image():
    """Return the absolute URL of the last large image on the media page."""
    media_page = make_soup(MEDIASITE)
    # The fourth <ul> on the page holds the "Open large image" links;
    # the last of those is used as the fallback image.
    gallery = media_page.find_all("ul")[3]
    links = gallery.find_all("a", {"title": "Open large image"})
    return BASESITE + links[-1].get("href")