def main():
    """Fetch the "new" and "soon" listings from piter.com and print the titles."""
    listings = (
        ("=== new === ", "https://www.piter.com/collection/new"),
        ("=== soon ===", "https://www.piter.com/collection/soon"),
    )
    for header, url in listings:
        soup = web(get_page(url), "html.parser")
        print_titles(header, books_titles(soup))
def get_books():
    """Scrape piter.com and return the new and upcoming book titles.

    Returns a dict with keys "new" and "soon", each mapping to the
    title list extracted from the corresponding collection page.
    """
    sources = (
        ("new", "https://www.piter.com/collection/new"),
        ("soon", "https://www.piter.com/collection/soon"),
    )
    result = {}
    for key, url in sources:
        soup = web(get_page(url), "html.parser")
        result[key] = books_titles(soup)
    return result
def get_books():
    """Scrape williamspublishing.com and return its new book titles.

    Returns a dict with the single key "new".
    """
    url = "http://www.williamspublishing.com/indexns.shtml"
    soup = web(get_page(url), "html.parser")
    return {"new": books_titles(soup)}
def get_books():
    """Scrape bhv.ru and return the first 15 new book titles.

    Returns a dict with the single key "new".
    """
    url = "http://www.bhv.ru/books/new.php"
    soup = web(get_page(url), "html.parser")
    # The listing can be long; keep only the first 15 entries.
    return {"new": books_titles(soup)[:15]}
def grab(self, url):
    """Parse an article page and return its title, image URL and paragraphs."""
    # Strip the problematic whitespace character before parsing.
    raw = self.get_content(url).replace(" ", "")
    soup = web(raw, "html.parser")
    paragraphs = [node.text for node in soup.select(".b-text > p")]
    return {
        "title": soup.select(".b-topic__title")[0].text.strip(),
        "image": soup.select(".g-picture")[0]["src"],
        "content": paragraphs,
    }
def get_parsed_course(page):
    """Parse a course page and return its main attributes as a dict."""
    soup = web(page, "html.parser")
    return {
        "title": get_title(soup),
        "start_date": get_start_date(soup),
        "week_count": get_week_count(soup),
        "avg_rating": get_rating(soup),
        "language": get_language(soup),
    }
def main():
    """Scrape dmkpress.com and print the new, coming and printed book titles."""
    soup = web(get_page("https://dmkpress.com/"), "html.parser")
    sections = (
        ("=== new ===", new_books_titles(soup)),
        ("=== coming === ", coming_titles(soup)),
        ("=== printed ===", book_print(soup)),
    )
    for header, titles in sections:
        print_titles(header, titles)
def get_books():
    """Scrape dmkpress.com and return its upcoming and printed book titles.

    Returns a dict with keys "coming" and "printing".
    """
    soup = web(get_page("https://dmkpress.com/"), "html.parser")
    printed = book_print(soup)
    upcoming = coming_titles(soup)
    return {"coming": upcoming, "printing": printed}
def grab(self, url):
    """Parse an article page; return a dict with whichever of "title"
    and "content" could be extracted (keys are omitted when missing)."""
    soup = web(self.get_content(url, self.site_encoding), "html.parser")
    obj = {}
    headline = soup.select(".article_name")
    if headline:
        obj["title"] = headline[0].text
    paragraphs = soup.select(".article_text_wrapper > p")
    if paragraphs:
        obj["content"] = [p.text for p in paragraphs]
    return obj
def books_scraping():
    """Fetch bhv.ru and print the author and title of each featured book."""
    response = requests.get("http://bhv.ru")
    if response.status_code != 200:
        # Best-effort scraper: print nothing when the site is unavailable.
        return
    # The site is served in Windows-1251; set it before reading .text.
    response.encoding = "cp1251"
    soup = web(response.text, "html.parser")
    books = soup.select("#pageRightColumn > .infoBlock > .bookInfo")
    print("===Новинки издательства===\n")
    for card in books:
        writer = card.select(".bookAuthor")[0].getText()
        name = card.select(".bookTitle > a")[0].getText()
        print(writer, "->", name)
def grab(self, url):
    """Parse an article page and return its title, image URL and paragraphs."""
    soup = web(self.get_content(url), "html.parser")
    headline = soup.select(".b-material-before-body__data > h1")[0]
    picture = soup.select(".b-material-incut-m-image > img")[0]
    body = [p.text for p in soup.select(".js-mediator-article > p")]
    return {
        "title": headline.text,
        "image": picture["src"],
        "content": body,
    }
def grab(self, url):
    """Parse an interfax.ru article page.

    Returns a dict containing whichever of the keys "title", "image"
    and "content" could be extracted; keys are omitted when the
    corresponding element is missing from the page.
    """
    content = web(self.get_content(url, self.site_encoding), "html.parser")
    obj = {}
    # Guard on the result list itself (not element [0]) so pages without
    # a title do not raise IndexError — matches the other two guards.
    if content.select(".textMTitle"):
        obj["title"] = content.select(".textMTitle")[0].text
    if content.select(".inner > img"):
        # The image "src" is site-relative; prefix the site origin.
        obj["image"] = "{}{}".format(
            "https://www.interfax.ru/",
            content.select(".inner > img")[0]["src"]
        )
    # (Removed a stray debug print of the paragraph count.)
    if content.select(".at > article > p"):
        obj["content"] = list(
            map(
                lambda x: x.text,
                content.select(".at > article > p")
            )
        )
    return obj
def parse_page(page, handler):
    """Parse raw HTML and apply *handler* to the resulting soup."""
    return handler(web(page, "html.parser"))
def main():
    """Print the new book titles scraped from bhv.ru."""
    url = "http://www.bhv.ru/books/new.php"
    soup = web(get_page(url), "html.parser")
    print_titles("=== new === ", books_titles(soup))
# -*- coding: utf-8 -*- """ Destiny2 Weekly Reset Scraper @author: MadMerlyn """ from urllib.request import urlopen import sys from bs4 import BeautifulSoup as web #Load LiveEvents submenu ORIGINAL = sys.stdout HTML = urlopen('https://www.bungie.net/en/Explore/Category?category=LiveEvents') SOUP = web(HTML, 'lxml') #Grab all event links and crawl them LINKS = SOUP.find_all('a', {'class':'explore-item'}) CRAWL = ['https://www.bungie.net'+x.get('href') for x in LINKS] HTML2 = [urlopen(x) for x in CRAWL] ACTIVITIES = [web(x, 'lxml') for x in HTML2] for item in ACTIVITIES: act = item.find('div', {'id':'explore-container'}) if 'nightfall' in act.get_text().lower(): event_dates = act.find('div', {'destiny-event-date-range'}).get_text().strip() div = act.find('div', {'data-identifier':'quest-activity-information'}) title = div.find('div', {'class':'title'}).get_text() subtitle = div.find('div', {'class':'subtitle'}).get_text() div = act.findAll('div', {'data-identifier':'modifier-information'}) mods = [x.find('div', {'class':'title'}).get_text() for x in div] mod_descs = [x.find('div', {'class':'subtitle'}).get_text() for x in div]
def main():
    """Print the new book titles scraped from williamspublishing.com."""
    url = "http://www.williamspublishing.com/indexns.shtml"
    soup = web(get_page(url), "html.parser")
    print_titles("=== new === ", books_titles(soup))