def construct_channel(self, *args, **kwargs):
    """
    Creates ChannelNode and builds the topic tree.
    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra arguments and options not handled by `uploadchannel`.
        For example, add the command line option lang="fr" and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
    Returns: ChannelNode
    """
    # Create ChannelNode from data in self.channel_info
    channel = self.get_channel(*args, **kwargs)

    # Parse the index page to get the topics
    resp = downloader.make_request("http://proyectodescartes.org/descartescms/")
    soup = BeautifulSoup(resp.content, "html.parser")
    topics = soup.find_all("a", "item")
    final_topics = self.parse_topics(topics, channel)
    for topic in final_topics:
        self.download_subject(topic[0], topic[1], topic[2])

    # Check for errors in channel construction
    raise_for_invalid_channel(channel)
    return channel
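
# `construct_channel` above relies on ricecooker's `SushiChef.get_channel`,
# which builds the ChannelNode from the chef class's `channel_info` dict.
# A minimal sketch of what that dict looks like for this chef; the exact
# values here are illustrative assumptions, not the chef's real metadata:
#
# channel_info = {
#     "CHANNEL_SOURCE_DOMAIN": "proyectodescartes.org",   # assumed
#     "CHANNEL_SOURCE_ID": "proyecto-descartes",          # assumed
#     "CHANNEL_TITLE": "Proyecto Descartes",              # assumed
#     "CHANNEL_LANGUAGE": CHANNEL_LANGUAGE,
# }
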
def download_subject(self, subject, link, parent):
    """ Parse each subject page. """
    LOGGER.info("Processing subject: {}".format(subject.title))

    # No need to parse the content under the subject when link is not valid
    if "javascript:void(0);" in link:
        parent.add_child(subject)
        return

    # Parse each subject's index page
    resp = downloader.make_request(link)
    soup = BeautifulSoup(resp.content, "html.parser")
    selected_category = soup.find("option", {
        "class": "level0",
        "selected": "selected"
    })
    if not selected_category:
        return
    parent.add_child(subject)

    for item in AGE_RANGE.keys():
        params = OrderedDict([
            ("category", selected_category["value"]),
            ("moduleId", "282"),
            ("format", "count"),
        ])
        for index in range(len(AGE_RANGE[item])):
            params["taga[{}]".format(index)] = AGE_RANGE[item][index]

        # Parse the topics of age range under each subject
        resp = downloader.make_request(
            "{}/itemlist/filter".format(link), params=params)
        count = int(resp.text.split('\n')[0])
        if count == 0:
            continue

        LOGGER.info("Processing topic: {}".format(item))
        age_topic = TopicNode(source_id=item, title=item)
        subject.add_child(age_topic)

        total_pages = ceil(count / 20)
        for i in range(total_pages):
            page_params = OrderedDict(params)
            LOGGER.info("Processing page: {}".format(i))
            self.download_content(age_topic, link, page_params,
                                  selected_category["value"], i * 20)
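
# A minimal sketch of the AGE_RANGE mapping that `download_subject` assumes:
# each key is an age-range topic title and each value is the list of site tag
# ids sent as taga[0], taga[1], ... filter parameters. The keys and ids below
# are placeholders, not the real values used by the chef:
#
# AGE_RANGE = OrderedDict([
#     ("Ages 3-5", ["12", "13"]),
#     ("Ages 6-8", ["14"]),
# ])
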
def download_content(self, parent, link, params, selected_category, start):
    """ Parse each content page. """
    params["start"] = start
    params.pop("format")

    # Parse each page of the result
    resp = downloader.make_request(
        "{}/itemlist/filter".format(link), params=params)
    soup = BeautifulSoup(resp.content, "html.parser")

    # Find all the content on each page
    for item in soup.find("tbody").find_all("a"):
        content_url = "http://proyectodescartes.org{}".format(item["href"])
        title = item.text.strip()
        source_id = item["href"].split("/")[-1]

        # Parse each content's page
        response = downloader.make_request(content_url)
        page = BeautifulSoup(response.content, "html.parser")
        thumbnail_url = "http://proyectodescartes.org{}".format(
            page.find("div", class_="itemFullText").find("img")["src"])
        author = self.get_content_author(page)
        zip_path = self.get_content_zip(page)
        if not zip_path:
            LOGGER.info(
                "The url for the zip file does not exist on this page: {}"
                .format(content_url))
            continue

        content_node = HTML5AppNode(
            source_id=source_id,
            title=title,
            license=CC_BY_NC_SALicense(
                copyright_holder="Proyecto Descartes"),
            language=CHANNEL_LANGUAGE,
            files=[files.HTMLZipFile(zip_path)],
            author=author,
            thumbnail=thumbnail_url,
        )
        parent.add_child(content_node)
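
# For reference, by the time `download_content` issues its request the query
# params look roughly like this (values illustrative):
#
# OrderedDict([("category", "118"), ("moduleId", "282"),
#              ("taga[0]", "12"), ("start", 20)])
#
# i.e. the "format=count" probe key used to size the result set is dropped,
# and "start" pages through the item list 20 results at a time.
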
def books_for_each_category(category):
    """
    Get all the books for a given category.
    Parameters:
    * category - The name of the category that is related to the books
    """
    LOGGER.info("\tCrawling books for {}......\n".format(category))

    # Get the json file of the page and parse it
    payload = {"page": 1, "per_page": 24, "categories[]": category}
    response = downloader.make_request(
        BOOK_SEARCH_URL, params=payload, clear_cookies=False)
    data = response.json()
    total_pages = data["metadata"]["totalPages"]
    LOGGER.info("\tThere are in total {} pages for {}......\n".format(
        total_pages, category))

    # List of books for the first page
    booklist = get_books_from_results(data["data"])

    # Get the rest of the pages' books
    for i in range(1, total_pages):
        payload["page"] = i + 1
        response = downloader.make_request(
            BOOK_SEARCH_URL, params=payload, clear_cookies=False)
        # Skip the page if there is an error (usually a 500 error)
        if response.status_code != 200:
            continue
        data = response.json()
        booklist += get_books_from_results(data["data"])

    LOGGER.info(
        "\tFinished getting all the books for {}\n\t================\n".format(
            category))
    return booklist
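
# Example usage of `books_for_each_category` (the category name is an
# illustrative assumption; real names come from the filters API used by
# `download_all` below):
#
# >>> booklist = books_for_each_category("Animal Stories")
# >>> len(booklist)  # books from every results page, 24 requested per page
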
def download_all():
    """
    Parse the json returned by the StoryWeaver API and generate a dictionary
    that contains all the information regarding category, publisher,
    language, level and book.
    """
    resp = downloader.make_request(FILTERS_URL, clear_cookies=False).json()
    categories = [
        item["name"] for item in resp["data"]["category"]["queryValues"]
    ]

    channel_tree = {}
    for category in categories:
        channel_tree[category] = {}
        booklist = books_for_each_category(category)

        # Reset the StoryWeaver Community book count and folder index
        # for each category
        storyweaver_community_num = 0
        index = 1

        for book in booklist:
            publisher = book["publisher"]
            language = book["language"]
            level = book["level"]

            if publisher == "StoryWeaver Community":
                storyweaver_community_num += 1
                # Make sure we only have 20 books in one
                # StoryWeaver Community folder
                if storyweaver_community_num > 20:
                    index += 1
                    storyweaver_community_num = 1
                publisher = "{}-{}".format(publisher, index)

            # Nest the book under category -> publisher -> language -> level
            publisher_dict = channel_tree[category].setdefault(publisher, {})
            language_dict = publisher_dict.setdefault(language, {})
            language_dict.setdefault(level, []).append(book)
    return channel_tree
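
# The structure returned by `download_all` nests four levels deep; a sketch
# (the category, publisher, and level names here are illustrative):
#
# channel_tree = {
#     "Animal Stories": {
#         "Pratham Books": {
#             "English": {"1": [book, ...], "2": [book, ...]},
#         },
#         "StoryWeaver Community-1": {...},  # community books, 20 per folder
#     },
# }
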
def get_content_zip(self, page):
    """ Get the zip path of the content. """
    # Find the zip url of the content and check if it's valid.
    zip_href = page.find("a", href=re.compile(".zip"))
    if not zip_href:
        return None
    zip_url = "http://proyectodescartes.org{}".format(zip_href["href"])
    zip_resp = downloader.make_request(zip_url)
    if zip_resp.status_code != 200:
        return None

    filepath = "/tmp/{}".format(zip_url.split("/")[-1])
    with open(filepath, "wb") as f:
        f.write(zip_resp.content)

    dst = tempfile.mkdtemp()
    html_name = page.find(
        "div", class_="itemFullText").find("a")["href"].split("/")[-1]

    # Unzip the downloaded zip file and re-zip the folder. If index.html
    # does not exist at the top level, rename the entry page in the folder
    # to index.html before re-zipping.
    with zipfile.ZipFile(filepath) as zf:
        extracted_src = unquote(filepath.split("/")[-1].split(".zip")[0])
        zf.extractall(dst)
        if html_name != "index.html":
            src_index = os.path.join(dst, extracted_src, html_name)
            dst_index = src_index.replace(html_name, "index.html")
            if os.path.exists(src_index):
                os.rename(src_index, dst_index)
    zip_path = create_predictable_zip(os.path.join(dst, extracted_src))
    return zip_path
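
# Note: `create_predictable_zip` (from ricecooker.utils.zip) re-zips the
# extracted folder with deterministic ordering and metadata, so identical
# content always produces an identical archive and the HTMLZipFile hash
# stays stable across chef runs.
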
def add_node_document(booklist, level_topic, as_booklist):
    """
    Add books as DocumentNodes under a specific reading level.
    Parameters:
    * booklist - The list of books to be added as DocumentNodes
    * level_topic - The TopicNode for the current level that the
      DocumentNodes will be attached to
    * as_booklist - The list of books from African Storybooks
    """
    for item in booklist:
        # Initialize the source domain and content_id
        domain = uuid.uuid5(uuid.NAMESPACE_DNS, "storyweaver.org.in")
        book_id = str(item["source_id"])

        # If the publisher is AS and the book is found, then change the
        # source_domain and content_id
        if item["publisher"] == "African Storybook Initiative":
            check = check_if_story_in_AS(as_booklist, item["title"])
            if check[0]:
                domain = uuid.uuid5(
                    uuid.NAMESPACE_DNS, "www.africanstorybook.org")
                book_id = check[1]

        # StoryWeaver provides a link to a zip file, so download the zip
        # and extract the pdf file from it
        with tempfile.NamedTemporaryFile(suffix=".zip") as tempf:
            try:
                resp = downloader.make_request(
                    item["link"], clear_cookies=False)
                resp.raise_for_status()
                tempf.write(resp.content)
                # Flush buffered writes so zipfile can re-open the file
                # by name below
                tempf.flush()
            except Exception as e:
                # Do not create the node if the download fails
                LOGGER.info("Error: {} when downloading {}".format(
                    e, item["link"]))
                continue

            filename = ""
            with zipfile.ZipFile(tempf.name, "r") as f:
                for zipped_file in f.namelist():
                    if os.path.splitext(zipped_file)[1][1:] == "pdf":
                        tempdir = os.path.dirname(tempf.name)
                        f.extract(zipped_file, path=tempdir)
                        filename = os.path.join(tempdir, zipped_file)
                        break

        # If no pdf file was found in the zip, do not create the node
        if not filename:
            continue

        # Create the document node with the given information
        document_file = DocumentFile(path=filename)
        language_obj = getlang_by_name(item["language"])
        book = DocumentNode(
            title=item["title"],
            source_id=book_id,
            author=item["author"],
            provider=item["publisher"],
            files=[document_file],
            license=get_license(licenses.CC_BY,
                                copyright_holder="StoryWeaver"),
            thumbnail=item.get("thumbnail"),
            description=item["description"],
            domain_ns=domain,
            language=language_obj,
        )
        level_topic.add_child(book)
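
# Example wiring for `add_node_document` (the TopicNode here is an assumed
# stand-in for however the chef builds its per-level topics):
#
# level_topic = TopicNode(source_id="level-1", title="Level 1")
# add_node_document(booklist, level_topic, as_booklist)
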