예제 #1
0
def scrape():
    """Scrape blog posts from WEBSITE and return them as a list of Post objects.

    Fetches the index page, collects the article URLs, then visits each
    article page to extract its date and title. Articles without a
    parsable date (or without the expected title element) are skipped.
    Duplicate dates are disambiguated by bumping the date value so
    downstream ordering stays stable.
    """
    soup = make_soup(WEBSITE)
    data = []
    dates = []  # dates already emitted, used for duplicate detection

    # Get each individual entry URL
    articles = get_articles([], soup)

    # Get entry data
    for article in articles:
        blog_soup = make_soup(article)

        link = article
        date = get_date(blog_soup)
        if date is None:
            # No usable date — skip before doing any further parsing.
            continue

        if date in dates:
            # Two posts share a timestamp; nudge this one so it's unique.
            date += 1

        # Parse the title only for posts we are actually keeping; guard
        # against pages missing the expected heading (previously this
        # lookup ran before the date check and could crash on a page
        # that would have been skipped anyway).
        title_tag = blog_soup.find("h1", {"class": "article-title"})
        if title_tag is None:
            continue
        title = title_tag.text.strip()

        dates.append(date)
        data.append(Post(None, date, title, link, SOURCE_CODE, None))

    return data
예제 #2
0
def scrape():
    """Scrape posts from WEBSITE using a headless Chrome session.

    Selenium renders the page (the listing is built client-side), then
    the rendered source is parsed with BeautifulSoup. The browser is
    always shut down via try/finally — the previous version leaked the
    Chrome process on any exception.
    """
    # Setup selenium
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--headless')
    # NOTE(review): the positional driver path and chrome_options= are
    # deprecated in Selenium 4 — migrate to Service(...)/options= when
    # the dependency is upgraded.
    driver = webdriver.Chrome("/usr/local/bin/chromedriver",
                              chrome_options=options)
    try:
        driver.get(WEBSITE)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        data = []

        # Get each individual entry (get_articles may drive the browser,
        # so the driver must stay alive until the loop is done).
        articles = get_articles([], soup, driver)
        for article in articles:
            link = BASESITE + article.find(lambda tag: tag.name == 'a' and tag.get(
                'class') == ['link']).get("href")
            date = article.find("time").text.strip().replace('-', '') + "0000"
            title = article.find("h1").text.strip()

            data.append(
                Post(None, conform_date(date), title, link, SOURCE_CODE, None))

        return data
    finally:
        # Always release the browser process.
        driver.quit()
예제 #3
0
def scrape():
    """Walk every page of the news listing and return all posts.

    Follows the "next" pagination link until no further page exists.
    """
    data = []
    current_site = WEBSITE

    while current_site is not None:
        soup = make_soup(current_site)
        container_div = soup.find("div", {"class": "c-latest-news"})

        for post in container_div.find_all("div", {"class": "col-12 mb-5 col-lg-4"}):
            # Dates arrive with commas (e.g. "Jan 1, 2020"); strip them
            # before normalizing.
            raw_date = post.find("span", {"class": "published-date"}).text.strip().replace(',', '')

            data.append(Post(
                None,
                conform_date(raw_date),
                post.find("h3").text.strip(),
                post.find("a").get("href"),
                get_image(post),
                ALT_IMAGE,
                SOURCE_CODE,
                None,
            ))

            # Progress heartbeat every 50 posts.
            if len(data) % 50 == 0:
                print(now() + f"Processed {len(data)} posts")

        next_link = soup.find("a", {"class": "next"})
        current_site = next_link.get("href") if next_link is not None else None

    return data
def scrape():
    """Return every post found on WEBSITE's single listing page."""
    listing = make_soup(WEBSITE).find_all("div", {"class": "post-content"})
    data = []

    for entry in listing:
        url = entry.find("a").get("href")
        # Date text uses dashes; strip them and append a midnight time.
        stamp = entry.find("p").text.strip().replace('-', '') + "0000"
        heading = entry.find("h4").text.strip()

        data.append(Post(None, stamp, heading, url, SOURCE_CODE, None))

    return data
예제 #5
0
def scrape():
    """Scrape all posts (with thumbnails) from WEBSITE.

    Falls back to ALT_IMAGE when a post carries no inline <img>, instead
    of raising IndexError as the previous version did.
    """
    soup = make_soup(WEBSITE)
    data = []

    for post in soup.find_all("div", {"class": "post-content"}):
        date = post.find("p").text.strip().replace('-', '')
        title = post.find("h4").text.strip()
        link = post.find("a").get("href")
        # find_all replaces the deprecated findAll spelling; guard the
        # empty case rather than indexing blindly.
        images = post.find_all("img")
        alt_image = images[0].get("src") if images else ALT_IMAGE
        image = get_image(link)

        data.append(
            Post(None, date + "0000", title, link, image, alt_image,
                 SOURCE_CODE, None))

        # Progress heartbeat every 25 posts.
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")

    return data
예제 #6
0
def scrape():
    """Scrape every post in the blog-post list widget on WEBSITE."""
    soup = make_soup(WEBSITE)
    data = []

    # Iterating a <ul> Tag yields ALL of its children, including bare
    # whitespace text nodes (NavigableString). Those are str subclasses,
    # so .find() is str.find and the .get() calls below would crash —
    # skip anything that is not an element (text nodes have name None).
    for post in soup.find("ul", {"class": "blog_post_list_widget"}):
        if post.name is None:
            continue

        # Normalize the <abbr> title timestamp: drop the separator
        # characters, then trim the trailing two characters.
        date = post.find("abbr").get("title").replace("-", "").replace(" ", "").replace(":", "")[0:-2]
        title = post.find("a", {"class": "title"}).text.strip()
        link = post.find("a", {"class": "title"}).get("href")
        alt_image = ALT_IMAGE
        image_element = post.find("img", {"class": "post_image"})
        # Some posts ship without a thumbnail — fall back to ALT_IMAGE.
        image = image_element.get("src").replace(" ", "%20") if image_element else ALT_IMAGE

        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))

        # Progress heartbeat every 25 posts.
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")

    return data
예제 #7
0
def scrape():
    """Scrape news cards from WEBSITE and return them as Post objects."""
    soup = make_soup(WEBSITE)
    base_site = "https://windboundgame.com"
    data = []

    for post in soup.find_all("div", {"class": "card--news"}):
        date = get_date(post.find("p").text.strip())
        title = post.find("h3").text.strip()
        # Hrefs on this site are relative; prepend the domain.
        link = base_site + post.find("a").get("href")
        alt_image = ALT_IMAGE
        # Percent-encode spaces so the image URL stays valid.
        image = post.find("img").get("src").replace(" ", "%20")
        # (Removed a leftover debug print of the image URL.)

        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))

        # Progress heartbeat every 25 posts.
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")

    return data
def scrape():
    """Scrape articles from the second <section> of WEBSITE.

    The alt image is fetched once up front and shared by every post.
    """
    alt_image = get_alt_image()
    soup = make_soup(WEBSITE)

    data = []

    # find_all replaces the deprecated findAll spelling.
    for post in soup.find_all("section")[1].find_all("article"):
        date = post.find("time").text.replace("-", "") + "0000"
        title = post.find("h3").text.strip()
        link = BASESITE + post.find("a").get("href")
        # Removed the no-op `alt_image = alt_image` self-assignment that
        # used to sit here; percent-encode spaces in the image URL.
        image = BASESITE + post.find("picture").find("img").get("src").replace(" ", "%20")

        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))

        # Progress heartbeat every 25 posts.
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")

    return data
def scrape():
    """Crawl every paginated listing page and collect all posts.

    Follows the "next" pagination link until no further page exists.
    """
    data = []
    page = WEBSITE

    while page is not None:
        soup = make_soup(page)

        for post in soup.find_all("article", {"class": "post"}):
            # Strip dashes from the date text before normalizing.
            raw_date = post.find("span", {
                "class": "date"
            }).text.strip().replace('-', '')
            # The image URL is embedded in the cover div's inline style.
            style_attr = post.find("div", {
                "class": "background--cover"
            }).get("style")

            data.append(Post(
                None,
                conform_date(raw_date),
                post.find("h3").text.strip(),
                post.find("a").get("href"),
                get_image(style_attr),
                ALT_IMAGE,
                SOURCE_CODE,
                None,
            ))

            # Progress heartbeat every 25 posts.
            if len(data) % 25 == 0:
                print(now() + f"Processed {len(data)} posts")

        next_link = soup.find("a", {"class": "next"})
        page = next_link.get("href") if next_link is not None else None

    return data