def edp_scrape():
    """Harvest all EDP Open books (French and English catalogues).

    Runs the EDP multi-scraper over both catalogue pages and loads the
    resulting book data via add_from_bookdatas.
    """
    catalogue_urls = (
        'https://www.edp-open.org/books-in-french',
        'https://www.edp-open.org/books-in-english',
    )
    for catalogue_url in catalogue_urls:
        harvested = multiscrape(catalogue_url, scraper_class=EDPMultiScraper)
        add_from_bookdatas(harvested)
def load_ku(ku_round=None):
    """Harvest Knowledge Unlatched submissions.

    Args:
        ku_round: a single KU round identifier; when falsy, every round in
            the module-level ``ku_rounds`` sequence is loaded.

    Returns:
        list: editions created/updated across all processed rounds.
    """
    if ku_round:
        rounds_to_load = [ku_round]
    else:
        rounds_to_load = ku_rounds
    harvested = []
    for round_id in rounds_to_load:
        feed_url = 'https://app.knowledgeunlatched.org/api/rounds/{}/submissions.xml'.format(
            round_id)
        round_scrapers = multiscrape(feed_url, scraper_class=KUMultiScraper)
        harvested.extend(add_from_bookdatas(round_scrapers))
    return harvested
def load_routledge():
    """Harvest open-access books from the Routledge collection pages.

    Walks the subject sub-collections under collection 11526, scrapes each
    book page once, accumulating every subject a book appears under, then
    loads all scraped books via add_from_bookdatas.

    Returns:
        dict: book URL -> RoutledgeScraper (with merged subject metadata).
    """
    search_url = "https://www.routledge.com/collections/11526"

    def get_collections(url):
        # Yield (subject name, absolute collection URL) pairs from the
        # top-level collection page.
        try:
            response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
            if response.status_code == 200:
                doc = BeautifulSoup(response.content, 'lxml')
                for link in doc.find_all('a', href=re.compile('collections/11526/')):
                    yield (link.text, "https://www.routledge.com/" + link['href'])
        except requests.exceptions.ConnectionError:
            print('couldn\'t connect to %s' % search_url)

    def get_coll_books(url):
        # Yield individual book URLs from one sub-collection page.
        try:
            response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
            if response.status_code == 200:
                doc = BeautifulSoup(response.content, 'lxml')
                for link in doc.select('.media-title a'):
                    yield link['href']
        except requests.exceptions.ConnectionError:
            print('couldn\'t connect to %s' % url)

    books = {}
    for (subject, coll_url) in get_collections(search_url):
        print(subject)
        for book_url in get_coll_books(coll_url):
            if book_url not in books:
                # First time we see this book: scrape it and tag the subject.
                print(book_url)
                new_book = RoutledgeScraper(book_url)
                new_book.metadata['subjects'].append(subject)
                books[book_url] = new_book
            else:
                # Already scraped under another subject; just add the tag.
                books[book_url].metadata['subjects'].append(subject)
    # BUGFIX: was len(list[books.values()]) — subscripting the `list` type
    # builds a typing alias, and len() on it raises TypeError.
    print("Harvesting %s books" % len(books))
    add_from_bookdatas(books.values())
    return books
def load_springer(startpage=1, endpage=None):
    """Harvest Springer open-access books from paged search results.

    Args:
        startpage: first results page to fetch (1-based).
        endpage: last results page; defaults to ``startpage + 10``.

    Returns:
        The result of add_from_bookdatas over all scraped books.

    Note: relies on the module-level ``search_url`` template with one
    ``{}`` placeholder for the page number.
    """
    def springer_open_books(startpage, endpage):
        # Generator: yield a SpringerScraper for every book link found on
        # each results page in [startpage, endpage].
        endpage = endpage if endpage else startpage + 10
        for page in range(startpage, endpage + 1):
            url = search_url.format(page)
            try:
                response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
                if response.status_code == 200:
                    # Use the final (possibly redirected) URL as the base
                    # for resolving relative book links.
                    base = response.url
                    doc = BeautifulSoup(response.content, 'lxml')
                    for link in doc.select('a.title'):
                        book_url = urljoin(base, link['href'])
                        yield SpringerScraper(book_url)
            except requests.exceptions.ConnectionError:
                # BUGFIX: was a Python 2 print statement (SyntaxError on
                # Python 3); every other function in this file uses print().
                print('couldn\'t connect to %s' % url)
    return add_from_bookdatas(springer_open_books(startpage, endpage))
def add_by_sitemap(url, maxnum=None):
    """Harvest books listed in the sitemap at ``url``.

    Args:
        url: sitemap URL to scrape.
        maxnum: optional cap on the number of entries to process.

    Returns:
        The result of add_from_bookdatas over the scraped entries.
    """
    bookdatas = scrape_sitemap(url, maxnum=maxnum)
    return add_from_bookdatas(bookdatas)