def scrape():
    """Scrape every blog entry from WEBSITE and return a list of Post objects.

    Each article page is fetched individually; entries without a parsable
    date are skipped. Duplicate timestamps are bumped until unique so posts
    published at the same moment keep a stable, distinct ordering.
    """
    soup = make_soup(WEBSITE)
    data = []
    dates = []
    # Get each individual entry
    articles = get_articles([], soup)
    # Get entry data
    for article in articles:
        blog_soup = make_soup(article)
        link = article
        date = get_date(blog_soup)
        title = blog_soup.find("h1", {"class": "article-title"}).text.strip()
        if date is None:
            continue
        # Bump until unique: a single `+= 1` would still collide when three
        # or more posts share the same timestamp.
        while date in dates:
            date += 1
        dates.append(date)
        data.append(Post(None, date, title, link, SOURCE_CODE, None))
    return data
def scrape():
    """Scrape posts from WEBSITE with a headless Chrome driver.

    Returns a list of Post objects. The driver is always quit — even when
    page parsing raises — so no headless Chrome processes are leaked.
    """
    # Setup selenium
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--headless')
    driver = webdriver.Chrome("/usr/local/bin/chromedriver", chrome_options=options)
    try:
        driver.get(WEBSITE)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        data = []
        # Get each individual entry
        articles = get_articles([], soup, driver)
        for article in articles:
            link = BASESITE + article.find(
                lambda tag: tag.name == 'a' and tag.get('class') == ['link']
            ).get("href")
            # Site dates have no time component; pad with midnight HHMM.
            date = article.find("time").text.strip().replace('-', '') + "0000"
            title = article.find("h1").text.strip()
            data.append(
                Post(None, conform_date(date), title, link, SOURCE_CODE, None))
        return data
    finally:
        # Always release the browser, success or failure.
        driver.quit()
def scrape():
    """Walk WEBSITE's paginated news listing and return all posts.

    Follows the "next" link until no further page exists, emitting a
    progress line every 50 collected posts.
    """
    posts = []
    page = WEBSITE
    while page is not None:
        soup = make_soup(page)
        listing = soup.find("div", {"class": "c-latest-news"})
        for card in listing.find_all("div", {"class": "col-12 mb-5 col-lg-4"}):
            raw_date = card.find("span", {"class": "published-date"}).text.strip().replace(',', '')
            posts.append(Post(None, conform_date(raw_date),
                              card.find("h3").text.strip(),
                              card.find("a").get("href"),
                              get_image(card), ALT_IMAGE,
                              SOURCE_CODE, None))
            if len(posts) % 50 == 0:
                print(now() + f"Processed {len(posts)} posts")
        nxt = soup.find("a", {"class": "next"})
        page = nxt.get("href") if nxt is not None else None
    return posts
def scrape():
    """Collect every post from WEBSITE's single listing page."""
    posts = []
    for entry in make_soup(WEBSITE).find_all("div", {"class": "post-content"}):
        url = entry.find("a").get("href")
        # Strip the dashes from the listed date and pad a midnight HHMM suffix.
        stamp = entry.find("p").text.strip().replace('-', '') + "0000"
        heading = entry.find("h4").text.strip()
        posts.append(Post(None, stamp, heading, url, SOURCE_CODE, None))
    return posts
def scrape():
    """Scrape WEBSITE's post listing, resolving a full-size image per post.

    The first inline <img> acts as the fallback image; emits a progress
    line every 25 collected posts.
    """
    posts = []
    for entry in make_soup(WEBSITE).find_all("div", {"class": "post-content"}):
        stamp = entry.find("p").text.strip().replace('-', '')
        heading = entry.find("h4").text.strip()
        url = entry.find("a").get("href")
        fallback_img = entry.find_all("img")[0].get("src")
        posts.append(Post(None, stamp + "0000", heading, url,
                          get_image(url), fallback_img, SOURCE_CODE, None))
        if len(posts) % 25 == 0:
            print(now() + f"Processed {len(posts)} posts")
    return posts
def scrape():
    """Scrape the blog-post widget list on WEBSITE into Post objects.

    Whitespace text nodes between the list's children are skipped so tag
    lookups never run against plain strings.
    """
    soup = make_soup(WEBSITE)
    data = []
    for post in soup.find("ul", {"class": "blog_post_list_widget"}):
        # Iterating a <ul> Tag also yields NavigableString whitespace nodes
        # (str subclasses); `.find("abbr")` on one is str.find -> int, and
        # the chained `.get("title")` would crash.
        if isinstance(post, str):
            continue
        # Strip date separators and drop the trailing two characters
        # (presumably the seconds — TODO confirm against the live markup).
        date = post.find("abbr").get("title").replace("-", "").replace(" ", "").replace(":", "")[0:-2]
        title = post.find("a", {"class": "title"}).text.strip()
        link = post.find("a", {"class": "title"}).get("href")
        alt_image = ALT_IMAGE
        image_element = post.find("img", {"class": "post_image"})
        # URL-encode spaces; fall back to the default image when missing.
        image = image_element.get("src").replace(" ", "%20") if image_element else ALT_IMAGE
        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")
    return data
def scrape():
    """Scrape news cards from WEBSITE and return a list of Post objects."""
    soup = make_soup(WEBSITE)
    base_site = "https://windboundgame.com"
    data = []
    for post in soup.find_all("div", {"class": "card--news"}):
        date = get_date(post.find("p").text.strip())
        title = post.find("h3").text.strip()
        # Card links are site-relative; prefix the domain.
        link = base_site + post.find("a").get("href")
        alt_image = ALT_IMAGE
        # URL-encode spaces in the image path.
        image = post.find("img").get("src").replace(" ", "%20")
        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")
    return data
def scrape():
    """Scrape article entries from the second <section> of WEBSITE.

    The shared fallback image is fetched once up front; article links and
    images are site-relative and get the BASESITE prefix.
    """
    # One fallback image for every post on this site.
    alt_image = get_alt_image()
    soup = make_soup(WEBSITE)
    data = []
    for post in soup.findAll("section")[1].findAll("article"):
        # Strip the dashes from the date and pad a midnight HHMM suffix.
        date = post.find("time").text.replace("-", "") + "0000"
        title = post.find("h3").text.strip()
        link = BASESITE + post.find("a").get("href")
        # URL-encode spaces in the image path.
        image = BASESITE + post.find("picture").find("img").get("src").replace(" ", "%20")
        data.append(Post(None, date, title, link, image, alt_image, SOURCE_CODE, None))
        if len(data) % 25 == 0:
            print(now() + f"Processed {len(data)} posts")
    return data
def scrape():
    """Follow WEBSITE's pagination and collect every article as a Post.

    The post image is extracted from the inline CSS background style of
    each card; progress is reported every 25 posts.
    """
    collected = []
    page = WEBSITE
    while page is not None:
        soup = make_soup(page)
        for article in soup.find_all("article", {"class": "post"}):
            raw_date = article.find("span", {"class": "date"}).text.strip().replace('-', '')
            style_attr = article.find("div", {"class": "background--cover"}).get("style")
            collected.append(Post(None, conform_date(raw_date),
                                  article.find("h3").text.strip(),
                                  article.find("a").get("href"),
                                  get_image(style_attr), ALT_IMAGE,
                                  SOURCE_CODE, None))
            if len(collected) % 25 == 0:
                print(now() + f"Processed {len(collected)} posts")
        nxt = soup.find("a", {"class": "next"})
        page = nxt.get("href") if nxt is not None else None
    return collected