def generate_download_page(url, zipper):
    """ Create a page for files that are meant to be downloaded (e.g. worksheets)
        Args:
            url (str): url to file that is meant to be downloaded
            zipper (html_writer): where to write download page to
        Returns path to page in zipfile (str)
    """
    # Get template soup
    soup = BeautifulSoup("", "html.parser")
    with open('download.html', 'rb') as templatecode:
        newpage = BeautifulSoup(templatecode.read(), 'html5lib')

    # Determine if link is one of the recognized file types
    download_url = url.split("?")[0]
    filename = download_url.split("/")[-1]
    if download_url.endswith('pdf'):
        render_tag = soup.new_tag('embed')
    elif next(
        (e for e in IMAGE_EXTENSIONS if download_url.lower().endswith(e)),
            None):
        render_tag = soup.new_tag('img')
    else:
        LOGGER.error("Unknown file type found at {}".format(download_url))
        return ""

    # Add tag to new page and write page to zip
    render_tag['src'] = zipper.write_url(format_url(download_url), filename)
    newpage.body.append(render_tag)
    return zipper.write_contents(
        filename.split('.')[0] + ".html", newpage.prettify())
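
A minimal usage sketch (hypothetical: the worksheet URL is illustrative, and the chef module's own helpers such as format_url, IMAGE_EXTENSIONS, and LOGGER are assumed to be in scope):

from ricecooker.utils import html_writer

# Write a download page for a worksheet PDF into a standalone zip.
with html_writer.HTMLWriter('./worksheet_pages.zip') as zipper:
    page_path = generate_download_page(
        'https://example.org/files/fractions-worksheet.pdf?v=2', zipper)
    if page_path:
        print('Download page stored at', page_path, 'inside the zip')
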
def get_parsed_html_from_url(url, *args, **kwargs):
    response = sess.get(url, *args, **kwargs)
    if response.status_code != 200:
        LOGGER.error("STATUS: {}, URL: {}", response.status_code, url)
    elif not response.from_cache:
        LOGGER.debug("NOT CACHED:", url)
    return BeautifulSoup(response.content, "html.parser")
Example #3
def website_game_webresouce_to_ricecooker_node(lang, web_resource):
    """
    Create the ricecooker JSON structure for a game from the web resource dict `web_resource`.
    """
    game_node = dict(
        kind=content_kinds.HTML5,
        source_id=web_resource['source_id'],
        language=lang,
        title=web_resource['title'],
        description='source_url=' + web_resource['url'] if DEBUG_MODE else '',
        license=PRADIGI_LICENSE,
        thumbnail=web_resource.get('thumbnail_url'),
        files=[],
    )
    zip_tmp_path = get_zip_file(web_resource['url'], web_resource['main_file'])
    if zip_tmp_path:
        zip_file = dict(
            file_type=file_types.HTML5,
            path=zip_tmp_path,
            language=lang,
        )
        game_node['files'].append(zip_file)
        LOGGER.debug('Created HTML5AppNode for game ' + web_resource['title'])
        return game_node
    else:
        LOGGER.error('Failed to create zip for game at url=' +
                     web_resource['url'])
        return None
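
A hypothetical `web_resource` dict with the keys the function above reads (field names come from the code; the values and the parent topic are illustrative only):

web_resource = {
    'source_id': 'game-1234',
    'title': 'Counting Game',
    'url': 'https://example.org/games/CountingGame.zip',
    'main_file': 'CountingGame.html',
    'thumbnail_url': 'https://example.org/thumbnails/CountingGame.png',
}
game_node = website_game_webresouce_to_ricecooker_node('hi', web_resource)
if game_node:
    subject_topic['children'].append(game_node)  # subject_topic: a parent topic dict (assumed)
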
def scrape_resource(url, topic):
    resource = BeautifulSoup(downloader.read(url), 'html5lib')
    LOGGER.info('      {}'.format(resource.find('h2').text))

    filepath = download_resource(resource.find('div', {'class': 'decargas'}).find('a')['href'])
    license = None
    author = ''
    for data_section in resource.find('div', {'class': 'datos_generales'}).find_all('h4'):
        if 'Licencia' in data_section.text:
            try:
                license = LICENSE_MAP[data_section.find_next_sibling('p').text](copyright_holder="Ceibal")
            except KeyError as e:
                LOGGER.error(str(e))
                license = licenses.CC_BYLicense(copyright_holder="Ceibal")
        elif 'Autor' in data_section.text:
            author = data_section.find_next_sibling('p').text
    if filepath:
        thumbnail = resource.find('div', {'class': 'img-recurso'}).find('img')['src']
        if thumbnail.endswith('.gif'):
            thumbnail = os.path.sep.join([DOWNLOAD_DIRECTORY, thumbnail.split('/')[-1].replace('.gif', '.png')])
            with open(thumbnail, 'wb') as fobj:
                fobj.write(downloader.read(resource.find('div', {'class': 'img-recurso'}).find('img')['src']))

        topic.add_child(nodes.HTML5AppNode(
            title=resource.find('h2').text,
            source_id=url,
            license=license,
            author=author,
            description=resource.find('form').find_all('p')[1].text,
            thumbnail=thumbnail,
            tags=[tag.text[:30] for tag in resource.find_all('a', {'class': 'tags'})],
            files=[files.HTMLZipFile(path=filepath)],
        ))
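
A sketch of what the LICENSE_MAP lookup above might contain: page license labels mapped to ricecooker license classes (an assumption; the real map and its Spanish-language keys are specific to the Ceibal chef):

from ricecooker.classes import licenses

LICENSE_MAP = {
    'CC BY': licenses.CC_BYLicense,
    'CC BY-SA': licenses.CC_BY_SALicense,
    'CC BY-NC': licenses.CC_BY_NCLicense,
    'CC BY-NC-SA': licenses.CC_BY_NC_SALicense,
}
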
Example #5
    def make_request(self, url, timeout=60, *args, method='GET', **kwargs):
        """
        Failure-resistant HTTP GET/HEAD request helper method.
        """
        retry_count = 0
        max_retries = 5
        while True:
            try:
                response = self.SESSION.request(method,
                                                url,
                                                *args,
                                                timeout=timeout,
                                                **kwargs)
                break
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ReadTimeout) as e:
                retry_count += 1
                LOGGER.warning(
                    "Connection error ('{msg}'); about to perform retry {count} of {trymax}."
                    .format(msg=str(e), count=retry_count, trymax=max_retries))
                time.sleep(retry_count * 1)
                if retry_count >= max_retries:
                    LOGGER.error("FAILED TO RETRIEVE:" + str(url))
                    return None
        if response.status_code != 200:
            LOGGER.error("ERROR " + str(response.status_code) +
                         ' when getting url=' + url)
            return None
        return response
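
A hypothetical call site (assumes `scraper` is an instance of the class defining make_request, and that LOGGER is configured in the same module): callers must check for None, which signals repeated failures or a non-200 response.

from bs4 import BeautifulSoup

response = scraper.make_request('https://example.org/lessons/index.html', timeout=30)
if response is None:
    LOGGER.warning('Skipping page that could not be fetched')
else:
    page = BeautifulSoup(response.content, 'html.parser')
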
Example #6
    def to_tag(self, filename=None):
        try:
            img = self.create_tag('img')
            img['src'] = self.to_zip(filename=filename)
            return img
        except BROKEN_EXCEPTIONS as e:
            LOGGER.error(str(e))
            return self.create_broken_link_message(self.url)
def download_resource(endpoint):
    try:
        url = '{}{}'.format(BASE_URL, endpoint.lstrip('/'))
        filename, ext = os.path.splitext(endpoint)
        filename = '{}.zip'.format(filename.lstrip('/').replace('/', '-'))
        write_to_path = CeibalPageScraper(url, locale='es').to_file(filename=filename, directory=DOWNLOAD_DIRECTORY)
        return write_to_path
    except Exception as e:
        LOGGER.error(str(e))
    def _download_file(self, write_to_path):

        with html_writer.HTMLWriter(write_to_path) as zipper:
            try:
                self.zipper = zipper
                self.to_zip(filename='index.html')
            except Exception as e:
                # Any errors here will just say index.html file does not exist, so
                # print out error for more descriptive debugging
                LOGGER.error(str(e))
Example #9
    def to_tag(self, filename=None):
        try:
            embed = self.create_tag('embed')
            embed['src'] = self.to_zip(filename=filename)
            embed['width'] = '100%'
            embed['style'] = 'height: 500px;max-height: 100vh;'
            return embed
        except BROKEN_EXCEPTIONS as e:
            LOGGER.error(str(e))
            return self.create_broken_link_message(self.url)
Example #10
def transform_video_vertical(vertical, parent_title=None):
    if 'children' not in vertical:
        return None, []

    # 1. LOOK FOR AN OPTIONAL html PREFIX TO USE AS DESCRIPTION
    description = ''
    # Extract an optional description from the first html node
    first_child = vertical['children'][0]
    if first_child['kind'] == 'html':
        description = extract_text_from_html_item(first_child,
                                                  translate_from='ar')

    if parent_title:
        video_title = parent_title + ' ' + vertical['display_name']
    else:
        video_title = vertical['display_name']

    # 2. GET THE VIDEO
    videos = [ch for ch in vertical['children'] if ch['kind'] == 'video']
    assert len(videos) == 1, 'expected exactly one video, found {}'.format(len(videos))
    video = videos[0]
    video_dict = dict(kind=content_kinds.VIDEO,
                      source_id=video.get('youtube_id') or video.get('path'),
                      title=video_title,
                      author='Edraak',
                      description=description,
                      language=getlang('ar').code,
                      license=EDRAAK_LICENSE,
                      files=[])
    if 'youtube_id' in video:
        file_dict = dict(
            file_type=content_kinds.VIDEO,
            youtube_id=video['youtube_id'],
            language=getlang('ar').code,
            high_resolution=False,
        )
    elif 'path' in video:
        file_dict = dict(
            file_type=content_kinds.VIDEO,
            path=video['path'],
            language=getlang('ar').code,
            ffmpeg_settings={"crf": 24},
        )
    else:
        LOGGER.error('Video does not have youtube_id or path ' + str(video))
        return None, []
    video_dict['files'].append(file_dict)

    # 3. LOOK FOR AN OPTIONAL RESOURCES html
    downloadable_resources = []
    htmls = [ch for ch in vertical['children'] if ch['kind'] == 'html']
    for html in htmls:
        if 'downloadable_resources' in html:
            downloadable_resources.extend(html['downloadable_resources'])

    return video_dict, downloadable_resources
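
A hypothetical `vertical` input in the shape the function above expects: an optional leading html child (used for the description), exactly one video child, and optional html children carrying downloadable_resources (the exact html-item schema consumed by extract_text_from_html_item is an assumption):

vertical = {
    'display_name': 'Lesson 3',
    'children': [
        {'kind': 'html', 'content': '<p>Introduction to the lesson.</p>'},
        {'kind': 'video', 'youtube_id': 'qlGjA9p1UAM'},
    ],
}
video_dict, resources = transform_video_vertical(vertical, parent_title='Unit 1')
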
Example #11
    def to_tag(self, filename=None):
        try:
            audio = self.create_tag('audio')
            audio['controls'] = 'controls'
            audio['style'] = 'width: 100%;'
            source = self.create_tag('source')
            source['src'] = self.to_zip(filename=filename)
            audio.append(source)
            return audio
        except BROKEN_EXCEPTIONS as e:
            LOGGER.error(str(e))
            return self.create_broken_link_message(self.url)
Example #12
    def to_tag(self, filename=None):
        try:
            video = self.create_tag('video')
            video['controls'] = 'controls'
            video['style'] = 'width: 100%;'
            video['preload'] = 'auto'
            source = self.create_tag('source')
            source['src'] = self.to_zip(filename=filename)
            video.append(source)
            return video
        except BROKEN_EXCEPTIONS as e:
            LOGGER.error(str(e))
            return self.create_broken_link_message(self.url)
    def on_story_resource_page(self, url, page, context):
        LOGGER.debug('     in on_story_resource_page ' + url)
        html = str(page)
        story_resource_url = get_respath_url_from_html(html)
        if story_resource_url:
            page_dict = dict(
                url=story_resource_url,
                children=[],
            )
            page_dict.update(context)
            context['parent']['children'].append(page_dict)
        else:
            LOGGER.error('Failed to find story_resource_url on page %s' % url)
def save_book(book_detail, channel):
    book_id = book_detail["id"]
    book_source_id = get_book_source_id(book_id)
    book_title = book_detail["name"]
    level_id = book_detail["readingLevel"]
    language = book_detail["language"]
    language_id = language["id"]
    tags = book_detail["tags"]
    epub_url = book_detail["epubUrl"]
    pdf_urls = book_detail["pdfUrl"]
    pdf_portrait_url = pdf_urls.get("portraitUrl", "") if pdf_urls else ""
    pdf_landscape_url = pdf_urls.get("landscapeUrl", "") if pdf_urls else ""
    pdf_booklet_url = pdf_urls.get("bookletUrl", "") if pdf_urls else ""
    pdf_url = pdf_portrait_url or pdf_landscape_url or pdf_booklet_url

    if not pdf_url and not epub_url:
        LOGGER.error("No file found for \n {}".format(book_source_id))
        raise NoFileAvailableError()

    book_files = []
    if pdf_url:
        pdf_file = files.DocumentFile(path=pdf_url)
        book_files.append(pdf_file)
    if epub_url:
        epub_file = files.EPubFile(path=epub_url)
        book_files.append(epub_file)

    book = nodes.DocumentNode(
        source_id=book_source_id,
        title=book_title,
        license=licenses.PUBLIC_DOMAIN,  # TODO: get a real license and copyright holder
        files=book_files)

    language_topic = get_or_create_language_topic(language, channel)
    level_topic = get_or_create_level_topic(level_id, language_id,
                                            language_topic)

    if not tags:
        level_topic.add_child(book)
        return

    for tag in tags:
        tag_topic = get_or_create_tag_topic(tag, language_id, level_id,
                                            level_topic)
        tag_topic.add_child(book)
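
A hypothetical `book_detail` record with the fields save_book reads above (field names are taken from the code; the values are illustrative):

book_detail = {
    'id': 1042,
    'name': 'The Lost Kitten',
    'readingLevel': 2,
    'language': {'id': 11, 'name': 'English'},
    'tags': ['animals'],
    'epubUrl': 'https://example.org/books/1042.epub',
    'pdfUrl': {'portraitUrl': 'https://example.org/books/1042-portrait.pdf'},
}
save_book(book_detail, channel)  # `channel` is the ChannelNode being built (assumed)
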
    def on_story_page(self, url, page, context):
        LOGGER.debug('     in on_story_page ' + url)
        page_dict = dict(
            kind='story_page',
            url=url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)

        try:
            body_row = page.find('div', {'id': 'body-row'})
            contents_row = body_row.find('div', {'class': 'row'})
        except Exception as e:
            LOGGER.error('ERROR on_story_page: %s : %s' % (e, url))
            return
        contents = contents_row.find_all('div', {'class': 'col-md-3'})

        for content in contents:
            try:
                title = get_text(content.find('div', {'class': 'txtline'}))
                # TODO: description
                thumbnail = content.find('a').find('img')['src']
                thumbnail = get_absolute_path(thumbnail)

                # get_fun_content_link
                link = content.find('a')
                source_id = link['href'][1:]
                story_resource_url = get_absolute_path(link['href'])

                if self.should_ignore_url(story_resource_url):
                    print('ignoring story content', title, story_resource_url)
                    continue

                LOGGER.debug('      story_resource_page: %s: %s' % (source_id, title))
                context = dict(
                    parent = page_dict,
                    kind='story_resource_page',
                    title=title,
                    source_id=source_id,
                    thumbnail_url=thumbnail,
                )
                self.enqueue_url_and_context(story_resource_url, context)

            except Exception as e:
                LOGGER.error('on_story_page: %s : %s' % (e, content))
def scrape_video_collection(url, topic):
    """ Scrape videos under video collection and add to the topic node
        Args:
            url (str): url to video page (e.g. https://www.exploratorium.edu/video/inflatable-jimmy-kuehnle)
            topic (TopicNode): topic to add video nodes to
    """
    try:
        collection_contents = BeautifulSoup(read(url), 'html5lib')
        for result in collection_contents.find_all('div',
                                                   {'class': 'search-result'}):
            header = result.find('div',
                                 {'class': 'views-field-field-html-title'})
            LOGGER.info("            {}".format(header.text.strip()))

            # Get video from given url
            description = result.find('div', {'class': 'search-description'})
            video_contents = BeautifulSoup(read(header.find('a')['href']),
                                           'html.parser')
            for k, v in get_brightcove_mapping(video_contents).items():
                video_node = nodes.VideoNode(
                    source_id=k,
                    title=header.text.strip().replace("’", "'"),
                    description=description.text.strip()
                    if description else "",
                    license=LICENSE,
                    copyright_holder=COPYRIGHT_HOLDER,
                    author=v.get('author') or "",
                    files=[
                        files.WebVideoFile(v['url'], high_resolution=False)
                    ],
                    thumbnail=get_thumbnail_url(result.find('img')['src']),
                )

                # If video doesn't already exist here, add to topic
                if not next((c for c in topic.children
                             if c.source_id == video_node.source_id), None):
                    topic.add_child(video_node)

        # Scrape next page (if any)
        next_page_url = get_next_page_url(collection_contents)
        if next_page_url:
            scrape_video_collection(next_page_url, topic)

    except requests.exceptions.HTTPError:
        LOGGER.error("Could not read collection at {}".format(url))
def get_phet_zip_file(zip_file_url, main_file_and_query):
    """
    Phet simulations are provided in the zip file `phet.zip`, and the entry point
    is passed as a GET parameter in `main_file_and_query`. To make these compatible
    with Kolibri's default behaviour of loading index.html, we will:
      - Rename index.html to phetindex.html
      - Add a custom index.html that uses a javascript redirect to phetindex.html?{sim_id}
    """
    u = urlparse(main_file_and_query)
    idk, sim_id = u.query.split('=')
    assert idk == 'id', 'unknown query string format found: ' + main_file_and_query
    main_file = u.scheme + '://' + u.netloc + u.path  # skip querystring

    destpath = tempfile.mkdtemp()
    LOGGER.info('saving phet zip file in dir ' + destpath)
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # Rename main_file to phetindex.html
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'phetindex.html')
        os.rename(src, dest)

        # Create a custom index.html that redirects to phetindex.html?id=<sim_id>
        index_html = PHET_INDEX_HTML_TEMPLATE.format(sim_id=sim_id)
        with open(os.path.join(zip_folder, 'index.html'), 'w') as indexf:
            indexf.write(index_html)

        # Always be zipping!
        return create_predictable_zip(zip_folder)

    except Exception as e:
        LOGGER.error("get_phet_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file_and_query, destpath, e))
        return None
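
A sketch of what PHET_INDEX_HTML_TEMPLATE might look like: a minimal index.html whose only job is to redirect to phetindex.html with the simulation id (an assumption; the template actually shipped with the chef may differ):

PHET_INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
  <head>
    <script type="text/javascript">
      window.location.href = 'phetindex.html?id={sim_id}';
    </script>
  </head>
  <body></body>
</html>
"""
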
Example #18
def parse_tsv_file(filepath):
    """
    Load data from the TSV file located at `filepath` using csv.DictReader.
    Returns: a dict {id --> datum} of all the rows.
    """
    print('Loading TSV file', filepath)
    data_by_id = {}
    with open(filepath, encoding="utf-8-sig") as tsvfile:
        reader = csv.DictReader(tsvfile, dialect='excel-tab')
        for row in reader:
            if not row['id']:
                raise ValueError("Row with missing id " + str(row))
            try:
                clean_row = clean_tsv_row(row)
                data_by_id[row['id']] = clean_row
            except json.JSONDecodeError as e:
                LOGGER.error('Failed to parse row=' + str(dict(row)))
    return data_by_id
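
A sketch of the clean_tsv_row helper the loop above depends on: strip whitespace and decode any JSON-encoded columns, which is why json.JSONDecodeError is caught (an assumption; the real helper and its column names are specific to the chef):

import json

JSON_COLUMNS = ['extra_fields']  # hypothetical: columns stored as JSON strings

def clean_tsv_row(row):
    clean_row = {}
    for key, value in row.items():
        value = value.strip() if value else value
        if key in JSON_COLUMNS and value:
            value = json.loads(value)  # may raise json.JSONDecodeError
        clean_row[key] = value
    return clean_row
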
def get_subtopics(parent, path):
    doc = get_page(path)
    try:
        menu_row = doc.find('div', {'id': 'body-row'})
        menu_row = menu_row.find('div', {'class': 'col-md-2'})
    except Exception as e:
        LOGGER.error('get_subtopics: %s : %s' % (e, doc))
        return
    for subtopic in menu_row.find_all('a'):
        try:
            title = subtopic.get_text().strip()
            source_id = get_source_id(subtopic['href'])
            LOGGER.info('  subtopic: %s: %s' % (source_id, title))
            node = TopicNode(title=title, source_id=source_id)
            parent.add_child(node)
            get_lessons(node, subtopic['href'])
        except Exception as e:
            LOGGER.error('get_subtopics: %s : %s' % (e, subtopic))
    def on_subtopic_page(self, url, page, context):
        LOGGER.debug('     in on_subtopic_page ' + url)
        page_dict = dict(
            kind='subtopic_page',  # redundant...
            url=url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)

        try:
            menu_row = page.find('div', {'id': 'body-row'})
            menu_row = menu_row.find('div', {'class': 'col-md-9'})
        except Exception as e:
            LOGGER.error('on_subtopic_page: %s : %s' % (e, page))
            return
        for lesson in menu_row.find_all('div', {'class': 'thumbnail'}):
            try:
                title = lesson.find('div', {'class': 'txtline'}).get_text().strip()
                caption = lesson.find('div', class_='caption')
                description = get_text(caption) if caption else ''
                lesson_url = urljoin(url, lesson.find('a')['href'])

                if self.should_ignore_url(lesson_url):
                    LOGGER.info('ignoring lesson ' + lesson_url)
                    continue

                thumbnail_src = lesson.find('a').find('img')['src']
                thumbnail_url = urljoin(url, thumbnail_src)
                source_id = get_source_id(lesson.find('a')['href'])
                LOGGER.debug('         lesson: %s: %s' % (source_id, title))
                context = dict(
                    parent=page_dict,
                    kind='lesson_page',
                    title=title,
                    description=description,
                    source_id=source_id,
                    thumbnail_url=thumbnail_url,
                    children=[],
                )
                self.enqueue_url_and_context(lesson_url, context)
                # get_contents(node, link)
            except Exception as e:
                LOGGER.error('on_subtopic_page: %s : %s' % (e, lesson))
Example #21
def get_subtitles_using_youtube_dl(youtube_id):
    youtube_url = 'https://youtube.com/watch?v=' + youtube_id
    yt_resource = YouTubeResource(youtube_url)
    lang_codes = []
    try:
        result = yt_resource.get_resource_subtitles()
        # TODO(ivan) Consider including auto-generated subtitles to increase
        #       coverage and handle edge cases of videos that are translated
        #       but no metadata: https://www.youtube.com/watch?v=qlGjA9p1UAM
        if result:
            for lang_code, lang_subs in result['subtitles'].items():
                for lang_sub in lang_subs:
                    if lang_sub.get('ext') == 'vtt' and lang_code not in lang_codes:
                        lang_codes.append(lang_code)
    except Exception as e:
        LOGGER.error('get_subtitles_using_youtube_dl failed for ' +
                     youtube_url)
        LOGGER.error(str(e))
    return lang_codes
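
A hypothetical follow-up: attach one subtitle file per returned language code to an existing VideoNode (YouTubeSubtitleFile is part of ricecooker.classes.files; `video_node` is assumed to exist):

youtube_id = 'qlGjA9p1UAM'
for lang_code in get_subtitles_using_youtube_dl(youtube_id):
    video_node.add_file(files.YouTubeSubtitleFile(youtube_id=youtube_id, language=lang_code))
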
    def on_special_subtopic_page(self, url, page, context):
        LOGGER.debug('     in on_special_subtopic_page ' + url)
        page_dict = dict(
            kind='special_subtopic_page',  # redundant... -- mismatch with original special_subtopic_page
            url=url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)
        try:
            menu_row = page.find('div', {'id': 'body-row'})
            menu_row = menu_row.find('div', {'class': 'col-md-2'})
            print(str(menu_row))
        except Exception as e:
            LOGGER.error('on_special_subtopic_page: %s : %s' % (e, page))
            return
        for link in menu_row.find_all('a', {'class': 'list-group-item'}):
            try:
                title = link.get_text().strip()
                description = ''
                lesson_url = urljoin(url, link['href'])

                if self.should_ignore_url(lesson_url):
                    LOGGER.info('ignoring lesson ' + lesson_url)
                    continue

                source_id = get_source_id(link['href'])
                LOGGER.debug('         special lesson: %s: %s' % (source_id, title))
                context = dict(
                    parent=page_dict,
                    kind='fun_page',
                    title=title,
                    description=description,
                    source_id=source_id,
                    thumbnail_url=None,
                    children=[],
                )
                self.enqueue_url_and_context(lesson_url, context)
                # get_contents(node, link)
            except Exception as e:
                LOGGER.error('on_special_subtopic_page: %s : %s' % (e, link))
def get_topics(parent, path):
    doc = get_page(path)
    try:
        menu_row = doc.find('div', {'id': 'menu-row'})
    except Exception as e:
        LOGGER.error('get_topics: %s : %s' % (e, doc))
        return
    for topic in menu_row.find_all('a'):
        try:
            if topic['href'] == '#':
                continue
            title = topic.get_text().strip()
            source_id = get_source_id(topic['href'])
            LOGGER.info('topic: %s: %s' % (source_id, title))
            node = TopicNode(title=title, source_id=source_id)
            parent.add_child(node)
            get_subtopics(node, topic['href'])
            if DEBUG_MODE:
                return
        except Exception as e:
            LOGGER.error('get_topics: %s : %s' % (e, topic))
Example #24
    def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
        youtube_info = None
        # 1. Try to get from cache if allowed:
        if os.path.exists(self.cache_path) and use_cache:
            LOGGER.info("==> [%s] Retrieving cached information...", self.__str__())
            youtube_info = json.load(open(self.cache_path))
        # 2. Fetch info from youtube_dl
        if not youtube_info:
            LOGGER.info("==> [%s] Requesting info from youtube...", self.__str__())
            os.makedirs(self.cache_dir, exist_ok=True)
            try:
                youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
            except youtube_dl.utils.ExtractorError as e:
                if "unavailable" in str(e):
                    LOGGER.error("==> [%s] Resource unavailable for URL: %s", self.__str__(), self.url)
                    return None
                raise

            if youtube_resource:
                try:
                    # Save YouTube info to JSON cache file
                    youtube_info = youtube_resource.get_resource_info(options)
                    if youtube_info:
                        json.dump(youtube_info,
                                  open(self.cache_path, 'w'),
                                  indent=4,
                                  ensure_ascii=False,
                                  sort_keys=True)
                    else:
                        LOGGER.error("==> [%s] Failed to extract YouTube info", self.__str__())
                except Exception as e:
                    LOGGER.error("==> [%s] Failed to get YouTube info: %s", self.__str__(), e)
                    return None
        return youtube_info
def download(url, write_to_path, attempts=DOWNLOAD_ATTEMPTS):
    """ Download the web video
        Args:
            url (str): url to video to download
            write_to_path (str): where to write video to
            attempts (int): how many times to reattempt a download
    """
    try:
        video_format = "bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/best[height<=480][ext=mp4]"
        with youtube_dl.YoutubeDL({
                "format": video_format,
                "outtmpl": write_to_path
        }) as ydl:
            ydl.download([url])
    except youtube_dl.utils.DownloadError as e:
        # If there are more attempts, try again. Otherwise, return error
        if attempts > 0:
            download(url, write_to_path, attempts=attempts - 1)
        else:
            LOGGER.error("Could not download video {} ({})".format(
                url, str(e)))
            raise e
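
A hypothetical call: download a low-resolution copy to a temporary path before wrapping it in a file object (the path and URL are illustrative; LOGGER and youtube_dl are the module's own imports):

import os
import tempfile

write_to_path = os.path.join(tempfile.mkdtemp(), 'video.mp4')
try:
    download('https://www.youtube.com/watch?v=qlGjA9p1UAM', write_to_path)
except youtube_dl.utils.DownloadError:
    LOGGER.error('Giving up on this video')
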
    def on_topic_page(self, url, page, context):
        LOGGER.debug('in on_topic_page ' + url)
        page_dict = dict(
            kind='topic_page',
            url=url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)

        try:
            body_row = page.find('div', {'id': 'body-row'})
            menu_row = body_row.find('div', {'class': 'col-md-2'})
            subtopics = menu_row.find_all('a')
        except Exception as e:
            LOGGER.error('ERROR get_subtopics: %s : %s' % (e, url))
            return
        for subtopic in subtopics:
            try:
                subtopic_url = urljoin(url, subtopic['href'])

                if self.should_ignore_url(subtopic_url):
                    print('ignoring subtopic', subtopic_url)
                    continue

                title = get_text(subtopic)
                source_id = get_source_id(subtopic['href'])
                LOGGER.debug('  found subtopic: %s: %s' % (source_id, title))
                context = dict(
                    parent=page_dict,
                    kind='subtopic_page',
                    title=title,
                    source_id=source_id,
                    children=[],
                )
                self.enqueue_url_and_context(subtopic_url, context)
            except Exception as e:
                LOGGER.error('on_topic_page: %s : %s' % (e, subtopic))
Example #27
def get_subtree_by_subject_en(lang, subject):
    if lang not in PRADIGI_LANG_URL_MAP:
        raise ValueError('Language `lang` must be in PRADIGI_LANG_URL_MAP')
    wrt_filename = 'chefdata/trees/pradigi_{}_web_resource_tree.json'.format(
        lang)
    with open(wrt_filename) as jsonfile:
        web_resource_tree = json.load(jsonfile)
    subject_subtrees = web_resource_tree['children']
    try:
        for subject_subtree in subject_subtrees:
            if 'subject_en' in subject_subtree and subject_subtree[
                    'subject_en'] == subject:
                return subject_subtree
            elif 'source_id' in subject_subtree and subject_subtree[
                    'source_id'] == subject:
                return subject_subtree
            else:
                pass
                # print('no subject_en in '+ subject_subtree['source_id'])
    except Exception as e:
        LOGGER.error("in get_subtree_by_subject_en: %s, %s, %s, %s" %
                     (lang, subject, subject_subtree, e))
    return None
def get_lessons(parent, path):
    doc = get_page(path)
    try:
        menu_row = doc.find('div', {'id': 'body-row'})
        menu_row = menu_row.find('div', {'class': 'col-md-9'})
    except Exception as e:
        LOGGER.error('get_lessons: %s : %s' % (e, doc))
        return
    for lesson in menu_row.find_all('div', {'class': 'thumbnail'}):
        try:
            title = lesson.find('div', {'class': 'txtline'}).get_text().strip()
            link = lesson.find('a')['href']
            thumbnail = lesson.find('a').find('img')['src']
            thumbnail = get_absolute_path(thumbnail)
            source_id = get_source_id(link)
            LOGGER.info('    lesson: %s: %s' % (source_id, title))
            node = TopicNode(title=title,
                             source_id=source_id,
                             thumbnail=thumbnail)
            parent.add_child(node)
            get_contents(node, link)
        except Exception as e:
            LOGGER.error('get_lessons: %s : %s' % (e, lesson))
def get_zip_file(zip_file_url, main_file):
    """HTML games are provided as zip files, the entry point of the game is
     main_file. main_file needs to be renamed to index.html to make it
     compatible with Kolibri.
    """
    destpath = tempfile.mkdtemp()
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename main_file to index.html.
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        return create_predictable_zip(zip_folder)
    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file, destpath, e))
        return None
def download_zip_file(url):
    if not url:
        return (False, None)

    if get_suffix(url) != '.zip':
        return (False, None)

    response = sess.get(url)
    if response.status_code != 200:
        LOGGER.error("STATUS: {}, URL: {}", response.status_code, url)
        return (False, None)
    elif not response.from_cache:
        LOGGER.debug("NOT CACHED:", url)

    archive = zipfile.ZipFile(io.BytesIO(response.content))
    archive_members = list(
        filter(lambda f: f.filename.endswith('.pdf'), archive.infolist()))
    archive_member_names = [None] * len(archive_members)
    for i, pdf in enumerate(archive_members):
        path = os.path.join(PDFS_DATA_DIR, pdf.filename)
        archive_member_names[i] = path
        if not os.path.exists(path):
            archive.extract(pdf, PDFS_DATA_DIR)
    return (True, archive_member_names)
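
A hypothetical usage: fetch a zip of PDFs and wrap every extracted PDF in a DocumentFile (the URL is illustrative; `files` refers to ricecooker.classes.files as in the other examples):

success, pdf_paths = download_zip_file('https://example.org/resources/unit1.zip')
doc_files = []
if success:
    for pdf_path in pdf_paths:
        doc_files.append(files.DocumentFile(path=pdf_path))
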