Example #1
def edp_scrape():
    edp_urls = [
        'https://www.edp-open.org/books-in-french',
        'https://www.edp-open.org/books-in-english',
    ]
    for url in edp_urls:
        scrapers = multiscrape(url, scraper_class=EDPMultiScraper)
        add_from_bookdatas(scrapers)
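All of these snippets lean on two helpers from the surrounding project, multiscrape and add_from_bookdatas, whose definitions are not shown here. A minimal sketch of the interfaces Example #1 assumes (signatures inferred from usage, not authoritative):

def multiscrape(url, scraper_class):
    """Assumed: fetch the page at url and return an iterable of
    scraper_class instances, one per book discovered."""
    ...

def add_from_bookdatas(bookdatas):
    """Assumed: persist the metadata from each scraper and return the
    editions that were created or updated."""
    ...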
Example #2
def load_ku(ku_round=None):
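    # ku_rounds is assumed to be a module-level list of round identifiers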
    rounds = [ku_round] if ku_round else ku_rounds
    editions = []
    for around in rounds:
        ku_url = 'https://app.knowledgeunlatched.org/api/rounds/{}/submissions.xml'.format(
            around)
        scrapers = multiscrape(ku_url, scraper_class=KUMultiScraper)
        editions.extend(add_from_bookdatas(scrapers))
    return editions
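Called with no argument, load_ku() walks every entry in ku_rounds; passing a single round narrows the harvest to one submissions feed. A usage sketch (the round identifier here is hypothetical):

# Harvest all rounds listed in ku_rounds:
editions = load_ku()

# Harvest a single round only; 'round2' is a made-up identifier:
editions = load_ku(ku_round='round2')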
Example #3
def load_routledge():
    search_url = "https://www.routledge.com/collections/11526"

    def get_collections(url):
        try:
            response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
            if response.status_code == 200:
                doc = BeautifulSoup(response.content, 'lxml')
                for link in doc.find_all('a', href=re.compile('collections/11526/')):
                    yield (link.text, "https://www.routledge.com/" + link['href'])
        except requests.exceptions.ConnectionError:
            print('couldn\'t connect to %s' % url)

    def get_coll_books(url):
        try:
            response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
            if response.status_code == 200:
                doc = BeautifulSoup(response.content, 'lxml')
                for link in doc.select('.media-title a'):
                    yield link['href']
        except requests.exceptions.ConnectionError:
            print('couldn\'t connect to %s' % url)
    
    books = {}
    for (subject, coll_url) in get_collections(search_url):
        print(subject)
        for book_url in get_coll_books(coll_url):
            if book_url not in books:
                print(book_url)
                new_book = RoutledgeScraper(book_url)
                new_book.metadata['subjects'].append(subject)
                books[book_url] = new_book
            else:
                books[book_url].metadata['subjects'].append(subject)
    print("Harvesting %s books" % len(list[books.values()]))
    add_from_bookdatas(books.values())
    return books
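The pattern worth noting in Example #3 is the de-duplication across collections: a book that appears under several subjects gets a single RoutledgeScraper, with every subject appended to its metadata. A self-contained sketch of the same accumulation pattern using plain dicts (sample data hypothetical):

# Accumulate subjects per URL, creating each entry only once.
pairs = [
    ('History', 'https://example.com/book/1'),
    ('History', 'https://example.com/book/2'),
    ('Politics', 'https://example.com/book/1'),  # same book, second subject
]

books = {}
for subject, url in pairs:
    books.setdefault(url, {'subjects': []})['subjects'].append(subject)

assert books['https://example.com/book/1']['subjects'] == ['History', 'Politics']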
    
Example #4
def load_springer(startpage=1, endpage=None):
    def springer_open_books(startpage, endpage):
        endpage = endpage if endpage else startpage + 10
        for page in range(startpage, endpage + 1):
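            # search_url is assumed to be a module-level URL template with a '{}' page placeholder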
            url = search_url.format(page)
            try:
                response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
                if response.status_code == 200:
                    base = response.url
                    doc = BeautifulSoup(response.content, 'lxml')
                    for link in doc.select('a.title'):
                        book_url = urljoin(base, link['href'])
                        yield SpringerScraper(book_url)
            except requests.exceptions.ConnectionError:
                print('couldn\'t connect to %s' % url)
    return add_from_bookdatas(springer_open_books(startpage, endpage))
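Resolving each link against response.url rather than the request URL means relative hrefs still resolve correctly after any redirect. A quick illustration with urljoin (URLs hypothetical):

from urllib.parse import urljoin

base = 'https://link.springer.com/search/page/1'
print(urljoin(base, '/book/10.1007/978-3-030-00000-0'))
# https://link.springer.com/book/10.1007/978-3-030-00000-0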
Example #5
def add_by_sitemap(url, maxnum=None):
    return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))
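add_by_sitemap is the simplest entry point: hand it a sitemap URL and optionally cap how many books to harvest. A usage sketch (URL and cap hypothetical):

editions = add_by_sitemap('https://example.com/sitemap.xml', maxnum=50)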