def scrape():
    """Scrape every blog article and return a list of Post objects.

    Follows pagination via get_articles(), then visits each article page
    to extract its date and title. Articles without a parsable date are
    skipped. Dates are de-duplicated by incrementing until unique.
    """
    soup = make_soup(WEBSITE)
    dates = []
    data = []
    # Collect the URL of every individual article across all listing pages.
    articles = get_articles([], soup)
    # Visit each article page and extract its metadata.
    for article in articles:
        blog_soup = make_soup(article)
        link = article
        date = get_date(blog_soup)
        title = blog_soup.find("h1", {"class": "article-title"}).text.strip()
        if date is None:
            continue
        # Keep dates unique: the original bumped the value only once, which
        # could still collide with another already-recorded date; loop until
        # the value is genuinely unused.
        while date in dates:
            date += 1
        dates.append(date)
        data.append(Post(None, date, title, link, SOURCE_CODE, None))
    return data
def scrape():
    """Walk every paginated news-listing page and collect all posts."""
    posts = []
    page_url = WEBSITE
    while page_url is not None:
        page = make_soup(page_url)
        listing = page.find("div", {"class": "c-latest-news"})
        for card in listing.find_all("div", {"class": "col-12 mb-5 col-lg-4"}):
            raw_date = card.find("span", {"class": "published-date"}).text.strip().replace(',', '')
            posts.append(Post(
                None,
                conform_date(raw_date),
                card.find("h3").text.strip(),
                card.find("a").get("href"),
                get_image(card),
                ALT_IMAGE,
                SOURCE_CODE,
                None,
            ))
            # Periodic progress report while crawling.
            if len(posts) % 50 == 0:
                print(now() + f"Processed {len(posts)} posts")
        # Follow the "next" pagination link; stop when there is none.
        next_link = page.find("a", {"class": "next"})
        page_url = next_link.get("href") if next_link is not None else None
    return posts
def get_image(link):
    """Return the first image URL inside a post's content, or ALT_IMAGE."""
    content = make_soup(link).find("div", {"class": "post-content"})
    img = content.find("img")
    # Fall back to the shared placeholder when the post has no image.
    return img.get("src") if img else ALT_IMAGE
def get_articles(articles, soup):
    """Append every article URL across all listing pages to *articles*.

    Follows the "pagination-next" link until it disappears; returns the
    (mutated) articles list.
    """
    while True:
        next_page = soup.find("li", {"class": "pagination-next"})
        listing = soup.find("ul", {"class": "article-list"})
        for entry in listing.find_all("li"):
            articles.append(BASE_SITE + entry.find("a").get("href"))
        # No further pages: done.
        if next_page is None:
            break
        soup = make_soup(BASE_SITE + next_page.find("a").get("href"))
    return articles
def scrape():
    """Scrape the single listing page and return its posts."""
    page = make_soup(WEBSITE)
    results = []
    for entry in page.find_all("div", {"class": "post-content"}):
        url = entry.find("a").get("href")
        # Strip dashes from the date text and append a zero time component.
        stamp = entry.find("p").text.strip().replace('-', '') + "0000"
        heading = entry.find("h4").text.strip()
        results.append(Post(None, stamp, heading, url, SOURCE_CODE, None))
    return results
def scrape():
    """Scrape the listing page, resolving each post's full-size image."""
    page = make_soup(WEBSITE)
    posts = []
    for entry in page.find_all("div", {"class": "post-content"}):
        # Strip dashes from the date text; "0000" pads a zero time component.
        stamp = entry.find("p").text.strip().replace('-', '')
        heading = entry.find("h4").text.strip()
        url = entry.find("a").get("href")
        # Thumbnail from the listing doubles as the fallback image.
        fallback = entry.find_all("img")[0].get("src")
        posts.append(Post(None, stamp + "0000", heading, url,
                          get_image(url), fallback, SOURCE_CODE, None))
        if len(posts) % 25 == 0:
            print(now() + f"Processed {len(posts)} posts")
    return posts
def scrape():
    """Scrape the blog-post list widget and return its posts."""
    page = make_soup(WEBSITE)
    results = []
    for item in page.find("ul", {"class": "blog_post_list_widget"}):
        # Normalize the timestamp to digits only and drop the trailing
        # two characters.
        raw = item.find("abbr").get("title")
        stamp = raw.replace("-", "").replace(" ", "").replace(":", "")[:-2]
        title_link = item.find("a", {"class": "title"})
        thumb = item.find("img", {"class": "post_image"})
        # URL-encode spaces in the image link; fall back when absent.
        if thumb:
            picture = thumb.get("src").replace(" ", "%20")
        else:
            picture = ALT_IMAGE
        results.append(Post(None, stamp, title_link.text.strip(),
                            title_link.get("href"), picture, ALT_IMAGE,
                            SOURCE_CODE, None))
        if len(results) % 25 == 0:
            print(now() + f"Processed {len(results)} posts")
    return results
def scrape():
    """Scrape the Windbound news cards and return them as Post objects."""
    soup = make_soup(WEBSITE)
    base_site = "https://windboundgame.com"
    data = []
    for post in soup.find_all("div", {"class": "card--news"}):
        date = get_date(post.find("p").text.strip())
        title = post.find("h3").text.strip()
        # Card links are relative; prefix the site root.
        link = base_site + post.find("a").get("href")
        alt_image = ALT_IMAGE
        # URL-encode spaces so the image link stays valid.
        # (Removed a leftover debug print of the image URL that spammed
        # stdout on every card.)
        image = post.find("img").get("src").replace(" ", "%20")
        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")
    return data
def scrape():
    """Scrape the articles in the site's second <section> into Post objects."""
    # One fallback image is fetched once and shared by every post.
    alt_image = get_alt_image()
    soup = make_soup(WEBSITE)
    data = []
    # (Removed the no-op `alt_image = alt_image` self-assignment the
    # original performed on every iteration; also replaced the deprecated
    # BS4 `findAll` alias with `find_all` — identical behavior.)
    for post in soup.find_all("section")[1].find_all("article"):
        # Strip dashes from the date text and append a zero time component.
        date = post.find("time").text.replace("-", "") + "0000"
        title = post.find("h3").text.strip()
        link = BASESITE + post.find("a").get("href")
        # Image src is relative; prefix the site root and escape spaces.
        image = BASESITE + post.find("picture").find("img").get("src").replace(" ", "%20")
        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")
    return data
def scrape():
    """Collect every post across all paginated archive pages."""
    collected = []
    page_url = WEBSITE
    while page_url is not None:
        page = make_soup(page_url)
        for article in page.find_all("article", {"class": "post"}):
            raw_date = article.find("span", {"class": "date"}).text.strip().replace('-', '')
            # The post image is embedded in an inline CSS background style;
            # get_image() extracts the URL from that style text.
            style_text = article.find("div", {"class": "background--cover"}).get("style")
            collected.append(Post(
                None,
                conform_date(raw_date),
                article.find("h3").text.strip(),
                article.find("a").get("href"),
                get_image(style_text),
                ALT_IMAGE,
                SOURCE_CODE,
                None,
            ))
            if len(collected) % 25 == 0:
                print(now() + f"Processed {len(collected)} posts")
        # Follow the "next" pagination link; stop when there is none.
        nxt = page.find("a", {"class": "next"})
        page_url = nxt.get("href") if nxt is not None else None
    return collected
def get_alt_image():
    """Return the absolute URL of the last large image on the media page."""
    media_page = make_soup(MEDIASITE)
    # The fourth <ul> on the page holds the "Open large image" links;
    # the last of those is used as the fallback image.
    gallery = media_page.find_all("ul")[3]
    links = gallery.find_all("a", {"title": "Open large image"})
    return BASESITE + links[-1].get("href")