Beispiel #1
0
    def _find_type_from_folder_like(link_element: Tag,
                                    url: str) -> Optional[IliasElementType]:
        """
        Try crawling something that looks like a folder.

        Walks up from the link to the enclosing list item container, looks up the
        small descriptive icon inside it and derives the element type from the
        icon's alt text or file name.

        Returns the detected type. Returns None (after emitting a warning) if no
        enclosing container or no icon could be found. An icon that matches none
        of the known patterns is treated as a folder.
        """
        # pylint: disable=too-many-return-statements

        found_parent: Optional[Tag] = None

        # We look for the outer div of our inner link, to find information around it
        # (mostly the icon)
        for parent in link_element.parents:
            # Ancestors without a "class" attribute (such as the document root at the
            # end of the chain) must be skipped; indexing them with ["class"] would
            # raise a KeyError and the warning below could never be reached.
            parent_classes = parent.get("class") or []
            if "ilContainerListItemOuter" in parent_classes or "il-std-item" in parent_classes:
                found_parent = parent
                break

        if found_parent is None:
            _unexpected_html_warning()
            log.warn_contd(
                f"Tried to figure out element type, but did not find an icon for {url}"
            )
            return None

        # Find the small descriptive icon to figure out the type
        img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon")

        if img_tag is None:
            img_tag = found_parent.select_one("img.icon")

        if img_tag is None:
            _unexpected_html_warning()
            log.warn_contd(
                f"Tried to figure out element type, but did not find an image for {url}"
            )
            return None

        # A missing attribute is treated like an empty one instead of raising
        alt_text = str(img_tag.get("alt") or "")
        icon_src = str(img_tag.get("src") or "")

        if "opencast" in alt_text.lower():
            return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED

        # Checked in order, the first matching icon file name wins
        types_by_icon_suffix = (
            ("icon_exc.svg", IliasElementType.EXERCISE),
            ("icon_webr.svg", IliasElementType.LINK),
            ("icon_book.svg", IliasElementType.BOOKING),
            ("frm.svg", IliasElementType.FORUM),
            ("sess.svg", IliasElementType.MEETING),
            ("icon_tst.svg", IliasElementType.TEST),
        )
        for suffix, element_type in types_by_icon_suffix:
            if icon_src.endswith(suffix):
                return element_type

        return IliasElementType.FOLDER
Beispiel #2
0
    def _find_type_from_card(self,
                             card_title: Tag) -> Optional[IliasElementType]:
        """
        Determine the type of a card element from the icon in its header.

        Walks up from the given title element to the card root (an element
        carrying both the "il-card" and "thumbnail" classes) and inspects the
        classes of the icon found under ".il-card-repository-head".

        Returns the detected type, or None (after emitting a warning) if the
        card root or the icon is missing or the icon classes are not recognised.
        """
        def is_card_root(element: Tag) -> bool:
            # Ancestors without a "class" attribute (such as the document root)
            # would raise a KeyError when indexed with ["class"].
            classes = element.get("class") or []
            return "il-card" in classes and "thumbnail" in classes

        card_root: Optional[Tag] = None

        # We look for the card root
        for parent in card_title.parents:
            if is_card_root(parent):
                card_root = parent
                break

        if card_root is None:
            _unexpected_html_warning()
            log.warn_contd(
                f"Tried to figure out element type, but did not find an icon for {card_title}"
            )
            return None

        icon: Optional[Tag] = card_root.select_one(".il-card-repository-head .icon")

        # select_one returns None when nothing matches
        if icon is None:
            _unexpected_html_warning()
            log.warn_contd(
                f"Could not find an icon in the card for card title {card_title}")
            return None

        icon_classes = icon.get("class") or []

        # Checked in order, the first matching class wins
        types_by_icon_class = (
            ("opencast", IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED),
            ("exc", IliasElementType.EXERCISE),
            ("webr", IliasElementType.LINK),
            ("book", IliasElementType.BOOKING),
            ("frm", IliasElementType.FORUM),
            ("sess", IliasElementType.MEETING),
            ("tst", IliasElementType.TEST),
            ("fold", IliasElementType.FOLDER),
        )
        for icon_class, element_type in types_by_icon_class:
            if icon_class in icon_classes:
                return element_type

        _unexpected_html_warning()
        log.warn_contd(
            f"Could not extract type from {icon} for card title {card_title}")
        return None
Beispiel #3
0
    def _find_type_from_link(element_name: str, link_element: Tag,
                             url: str) -> Optional[IliasElementType]:
        """
        Work out the element type of a top level element, primarily from its URL.

        The query string is checked against a series of known markers. If none
        of them matches but the URL carries a "ref_id=" or points at "goto.php",
        the decision is delegated to the icon based folder-like detection.
        Returns None (after emitting a warning) if nothing matched.
        """
        url_parts = urlparse(url)
        query = url_parts.query

        # File links carry a "target=file_" marker
        if "target=file_" in query:
            return IliasElementType.FILE

        # Group and course targets are both reported as folders
        if "target=grp_" in query or "target=crs_" in query:
            return IliasElementType.FOLDER

        if "baseClass=ilExerciseHandlerGUI" in query:
            return IliasElementType.EXERCISE

        is_link_resource = "baseClass=ilLinkResourceHandlerGUI" in query
        if is_link_resource and "calldirectlink" in query:
            return IliasElementType.LINK

        if "cmd=showThreads" in query or "target=frm_" in query:
            return IliasElementType.FORUM

        if "cmdClass=ilobjtestgui" in query:
            return IliasElementType.TEST

        # Bookings and meetings cannot be told apart by their link alone. They
        # do have a ref_id though, so the icon is consulted for them. Anything
        # with a ref_id can *probably* be opened to reveal nested things
        # (video groups, directories, exercises, ...).
        if "ref_id=" in query or "goto.php" in url_parts.path:
            return IliasPage._find_type_from_folder_like(link_element, url)

        _unexpected_html_warning()
        log.warn_contd(
            f"Tried to figure out element type, but failed for {element_name!r} / {link_element!r})"
        )
        return None
Beispiel #4
0
    def _find_copa_entries(self) -> List[IliasPageElement]:
        """
        Collect the file entries of a content page.

        Every element with the "ilc_flist_a_FileListItemLink" class is turned
        into a FILE element. Links whose URL does not contain "file_id" are
        reported with a warning and left out.
        """
        entries: List[IliasPageElement] = []

        for anchor in self._soup.findAll(class_="ilc_flist_a_FileListItemLink"):
            target = self._abs_url_from_link(anchor)
            # Tabs are removed from the link text before it is sanitized
            link_text = anchor.getText().strip().replace("\t", "")
            label = _sanitize_path_name(link_text)

            if "file_id" in target:
                entries.append(
                    IliasPageElement(IliasElementType.FILE, target, label))
                continue

            _unexpected_html_warning()
            log.warn_contd(
                f"Found unknown content page item {label!r} with url {target!r}"
            )

        return entries
Beispiel #5
0
    def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
        """
        Collect the entries listed on the personal desktop page.

        Each ".il-item-title" element is expected to contain a link. The link
        text becomes the element name and the element type is derived from the
        link. Titles without a link, and links whose type cannot be determined,
        are skipped.
        """
        items: List[IliasPageElement] = []

        titles: List[Tag] = self._soup.select(".il-item-title")
        for title in titles:
            link = title.find("a")
            if link is None:
                # Without a link there is no URL to crawl. Dereferencing the
                # missing link would abort the whole page with an AttributeError.
                log.explain(
                    f"Skipping {title.getText().strip()!r}, it has no link")
                continue

            name = _sanitize_path_name(link.text.strip())
            url = self._abs_url_from_link(link)

            element_type = self._find_type_from_link(name, link, url)
            if not element_type:
                _unexpected_html_warning()
                log.warn_contd(f"Could not extract type for {link}")
                continue

            log.explain(f"Found {name!r}")

            # File links on this page may point at the info page, so the URL is
            # rewritten to the "_download" variant.
            if element_type == IliasElementType.FILE and "_download" not in url:
                url = re.sub(r"(target=file_\d+)", r"\1_download", url)
                log.explain("Rewired file URL to include download part")

            items.append(IliasPageElement(element_type, url, name))

        return items
Beispiel #6
0
    def _find_cards(self) -> List[IliasPageElement]:
        """
        Collect all elements that are presented as cards.

        Two kinds of card titles are handled:

        * Titles that are plain links (".card-title a"): the URL is taken from
          the link itself.
        * Titles that are buttons (".card-title button"): the URL is taken from
          the "window.open(...)" call that follows the button's id in the page
          source. For these, the text of the div following the caption is
          attached as description when it can be found.

        Cards whose type or target cannot be determined are skipped with a
        warning.
        """
        result: List[IliasPageElement] = []

        card_titles: List[Tag] = self._soup.select(".card-title a")

        for title in card_titles:
            url = self._abs_url_from_link(title)
            name = _sanitize_path_name(title.getText().strip())
            element_type = self._find_type_from_card(title)

            if not element_type:
                _unexpected_html_warning()
                log.warn_contd(f"Could not extract type for {title}")
                continue

            result.append(IliasPageElement(element_type, url, name))

        card_button_tiles: List[Tag] = self._soup.select(".card-title button")
        if not card_button_tiles:
            return result

        # Serialising the document is loop invariant, so it is done only once
        page_source = str(self._soup)

        for button in card_button_tiles:
            button_id = button.get("id")
            if not button_id:
                # Without an id the click handler cannot be located
                _unexpected_html_warning()
                log.warn_contd(
                    f"Could not find click handler target for {button}")
                continue

            # The id is matched literally, so regex metacharacters are escaped
            regex = re.compile(re.escape(button_id) +
                               r".*window.open\(['\"](.+?)['\"]")
            res = regex.search(page_source)
            if not res:
                _unexpected_html_warning()
                log.warn_contd(
                    f"Could not find click handler target for {button}")
                continue
            url = self._abs_url_from_relative(res.group(1))
            name = _sanitize_path_name(button.getText().strip())
            element_type = self._find_type_from_card(button)

            if not element_type:
                _unexpected_html_warning()
                log.warn_contd(f"Could not extract type for {button}")
                continue

            # The description lives in the div following the caption. Either
            # lookup may come back empty, in which case no description is set.
            caption_parent = button.findParent(
                "div",
                attrs={"class": lambda x: x and "caption" in x},
            )
            description_tag = None
            if caption_parent is not None:
                description_tag = caption_parent.find_next_sibling("div")

            if description_tag is None:
                result.append(IliasPageElement(element_type, url, name))
            else:
                description = description_tag.getText().strip()
                result.append(
                    IliasPageElement(element_type, url, name,
                                     description=description))

        return result