Ejemplo n.º 1
0
 def _find_exercise_entries(self) -> List[IliasPageElement]:
     """Detect whether this is an exercise detail or root page and parse it."""
     # Only the detail page for a single exercise carries a submission tab.
     submission_tab = self._soup.find(id="tab_submission")
     if not submission_tab:
         log.explain("Found no submission tab. This is an exercise root page")
         return self._find_exercise_entries_root_page()
     log.explain(
         "Found submission tab. This is an exercise detail page")
     return self._find_exercise_entries_detail_page()
Ejemplo n.º 2
0
    def _file_to_element(self, name: str, url: str,
                         link_element: Tag) -> IliasPageElement:
        """
        Build a FILE element from a file link.

        The file type from the item's property list is appended to the name,
        and the modification date is extracted from the property text, if any.
        """
        # Files have a list of properties (type, modification date, size, etc.)
        # In a series of divs.
        # Find the parent containing all those divs, so we can filter our what we need.
        # The lambda must guard against tags without a class attribute (x is
        # None there); without the guard `in` raises a TypeError.
        properties_parent: Tag = link_element.findParent(
            "div", {
                "class": lambda x: x and "il_ContainerListItem" in x
            }).select_one(".il_ItemProperties")
        # The first one is always the filetype
        file_type = properties_parent.select_one(
            "span.il_ItemProperty").getText().strip()

        # The rest does not have a stable order. Grab the whole text and reg-ex the date
        # out of it
        all_properties_text = properties_parent.getText().strip()
        modification_date_match = re.search(
            r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)",
            all_properties_text)
        if modification_date_match is None:
            # Not every file has a date; element creation tolerates None.
            modification_date = None
            log.explain(f"Element {name} at {url} has no date.")
        else:
            modification_date_str = modification_date_match.group(1)
            modification_date = demangle_date(modification_date_str)

        # Grab the name from the link text
        full_path = name + "." + file_type

        log.explain(f"Found file {full_path!r}")
        return IliasPageElement(IliasElementType.FILE, url, full_path,
                                modification_date)
Ejemplo n.º 3
0
    def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]:
        """Parse the detail page of a single exercise into FILE elements."""
        results: List[IliasPageElement] = []

        # Find all download links in the container (this will contain all the files)
        download_links: List[Tag] = self._soup.findAll(
            name="a",
            # download links contain the given command class
            attrs={"href": lambda x: x and "cmd=download" in x},
            text="Download")

        for link in download_links:
            parent_row: Tag = link.findParent("tr")
            children: List[Tag] = parent_row.findChildren("td")

            name = _sanitize_path_name(children[1].getText().strip())
            log.explain(f"Found exercise detail entry {name!r}")

            # The date column has no stable position, so scan the row from
            # the right for the first cell that parses as a date.
            # Initialize first: a row without cells previously left `date`
            # unbound (NameError) or reused the previous row's value.
            date = None
            for child in reversed(children):
                date = demangle_date(child.getText().strip(),
                                     fail_silently=True)
                if date is not None:
                    break
            if date is None:
                log.warn(f"Date parsing failed for exercise entry {name!r}")

            results.append(
                IliasPageElement(IliasElementType.FILE,
                                 self._abs_url_from_link(link), name, date))

        return results
Ejemplo n.º 4
0
    def _find_video_entries_paginated(self) -> List[IliasPageElement]:
        """
        Break pagination on a video listing by requesting up to 800 rows and
        re-queue the resulting URL as a plain video folder entry.
        """
        # The table id carries a token we must echo back in the query params.
        table = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+"))
        if table is None:
            log.warn(
                "Couldn't increase elements per page (table not found). I might miss elements."
            )
            return self._find_video_entries_no_paging()

        id_match = re.match(r"tbl_xoct_(.+)", table.attrs["id"])
        if id_match is None:
            log.warn(
                "Couldn't increase elements per page (table id not found). I might miss elements."
            )
            return self._find_video_entries_no_paging()

        unpaged_url = url_set_query_params(
            self._page_url,
            {
                f"tbl_xoct_{id_match.group(1)}_trows": "800",
                "cmd": "asyncGetTableGUI",
                "cmdMode": "asynch",
            },
        )

        log.explain("Disabled pagination, retrying folder as a new entry")
        return [
            IliasPageElement(IliasElementType.VIDEO_FOLDER, unpaged_url, "")
        ]
Ejemplo n.º 5
0
    def _listed_video_to_element(self, link: Tag) -> IliasPageElement:
        """
        Convert a row of the video listing table into a VIDEO_PLAYER element.

        The modification time column has no fixed position, so the row is
        scanned from the right for the first cell that looks like a date.
        """
        # The link is nested a few levels inside the table row; resolve the
        # row once and reuse it (the original re-walked link.parent... on
        # every loop iteration and again for the title).
        row: Tag = link.parent.parent.parent

        modification_time = None
        column_count = len(row.select("td.std"))
        for index in range(column_count, 0, -1):
            modification_string = row.select_one(
                f"td.std:nth-child({index})").getText().strip()
            # The separator dots are escaped — the previous pattern's bare `.`
            # matched any character and then crashed strptime on the mismatch.
            if re.search(r"\d+\.\d+\.\d+ - \d+:\d+", modification_string):
                modification_time = datetime.strptime(modification_string,
                                                      "%d.%m.%Y - %H:%M")
                break

        if modification_time is None:
            # Best effort fallback so the element still gets a timestamp.
            log.warn(f"Could not determine upload time for {link}")
            modification_time = datetime.now()

        # 3rd child (1 indexed) is the title; ILIAS omits the extension.
        title = row.select_one("td.std:nth-child(3)").getText().strip()
        title += ".mp4"

        video_name: str = _sanitize_path_name(title)

        video_url = self._abs_url_from_link(link)

        log.explain(f"Found video {video_name!r} at {video_url}")
        return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url,
                                video_name, modification_time)
Ejemplo n.º 6
0
    def _find_normal_entries(self) -> List[IliasPageElement]:
        """Collect all entries of a regular folder-like page."""
        entries: List[IliasPageElement] = []

        # Every container item exposes its element through a title link.
        for link in self._soup.select("a.il_ContainerItemTitle"):
            abs_url = self._abs_url_from_link(link)
            folder_path = self._find_upwards_folder_hierarchy(link)

            sanitized = _sanitize_path_name(link.getText())
            if folder_path:
                element_name = "/".join(folder_path) + "/" + sanitized
            else:
                element_name = sanitized

            element_type = self._find_type_from_link(element_name, link,
                                                     abs_url)
            description = self._find_link_description(link)

            # The last meeting on every page is expanded by default.
            # Its content is then shown inline *and* in the meeting page
            # itself, so the inline copy of non-meeting elements is skipped.
            if element_type != IliasElementType.MEETING \
                    and self._is_in_expanded_meeting(link):
                continue

            if not element_type:
                continue

            if element_type == IliasElementType.MEETING:
                normalized = _sanitize_path_name(
                    self._normalize_meeting_name(element_name))
                log.explain(
                    f"Normalized meeting name from {element_name!r} to {normalized!r}"
                )
                element_name = normalized
            elif element_type == IliasElementType.FILE:
                entries.append(
                    self._file_to_element(element_name, abs_url, link))
                continue

            log.explain(f"Found {element_name!r}")
            entries.append(
                IliasPageElement(element_type,
                                 abs_url,
                                 element_name,
                                 description=description))

        entries += self._find_cards()

        return entries
Ejemplo n.º 7
0
    def _listed_video_to_element(self, link: Tag) -> IliasPageElement:
        """
        Convert a row of the video listing table into a VIDEO_PLAYER element.
        """
        # The link is part of a table with multiple columns, describing
        # metadata. The modification time is usually the 6th child (1 indexed),
        # but its position is not stable. Scan the row from the right for the
        # first cell that looks like a date instead of hard-coding index 6,
        # which raised a ValueError whenever the column moved.
        row: Tag = link.parent.parent.parent

        modification_time = None
        for index in range(len(row.select("td.std")), 0, -1):
            candidate = row.select_one(
                f"td.std:nth-child({index})").getText().strip()
            if re.search(r"\d+\.\d+\.\d+ - \d+:\d+", candidate):
                modification_time = datetime.strptime(candidate,
                                                      "%d.%m.%Y - %H:%M")
                break

        if modification_time is None:
            # Best effort fallback so the element still gets a timestamp.
            log.warn(f"Could not determine upload time for {link}")
            modification_time = datetime.now()

        # 3rd child (1 indexed) is the title; ILIAS omits the extension.
        title = row.select_one("td.std:nth-child(3)").getText().strip()
        title += ".mp4"

        video_name: str = _sanitize_path_name(title)

        video_url = self._abs_url_from_link(link)

        log.explain(f"Found video {video_name!r} at {video_url}")
        return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url,
                                video_name, modification_time)
Ejemplo n.º 8
0
    def _player_to_video(self) -> List[IliasPageElement]:
        """
        Extract the actual video stream URL(s) from a video player page.

        Returns one VIDEO element per stream, or an empty list if the stream
        metadata could not be located.
        """
        # Fetch the actual video page. This is a small wrapper page initializing
        # a javascript player. Sadly we can not execute that JS. The actual
        # video stream url is nowhere on the page, but defined in a JS object
        # inside a script tag, passed to the player library.
        # We do the impossible and RegEx the stream JSON object out of the
        # page's HTML source.
        regex = re.compile(r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file",
                           re.IGNORECASE)
        json_match = regex.search(str(self._soup))

        if json_match is None:
            log.warn(
                "Could not find JSON stream info in video player. Ignoring video."
            )
            return []
        json_str = json_match.group(1)

        # Parse it. The previous `[stream for stream in ...]` was a no-op
        # copy of an already-parsed list; use it directly.
        json_object = json.loads(json_str)
        streams = json_object["streams"]

        # and just fetch the lone video url!
        if len(streams) == 1:
            video_url = streams[0]["sources"]["mp4"][0]["src"]
            return [
                IliasPageElement(IliasElementType.VIDEO, video_url,
                                 self._source_name)
            ]

        # Multiple streams (e.g. camera + screen capture): disambiguate the
        # file names with the stream's content tag.
        log.explain(f"Found multiple videos for stream at {self._source_name}")
        items = []
        for stream in sorted(streams, key=lambda stream: stream["content"]):
            full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4"
            video_url = stream["sources"]["mp4"][0]["src"]
            items.append(
                IliasPageElement(IliasElementType.VIDEO, video_url, full_name))

        return items
Ejemplo n.º 9
0
    def _find_video_entries(self) -> List[IliasPageElement]:
        """
        Find all video elements on this page.

        ILIAS has three stages for video pages:
        1. The initial dummy page without any videos. This page contains the
           link to the listing
        2. The video listing which might be paginated
        3. An unpaginated video listing (or at least one that includes 800
           videos)

        We need to figure out where we are.
        """
        video_element_table: Tag = self._soup.find(
            name="table", id=re.compile(r"tbl_xoct_.+"))

        if video_element_table is None:
            # We are in stage 1
            # The page is actually empty but contains the link to stage 2
            content_link: Tag = self._soup.select_one("#tab_series a")
            url: str = self._abs_url_from_link(content_link)
            query_params = {
                "limit": "800",
                "cmd": "asyncGetTableGUI",
                "cmdMode": "asynch"
            }
            url = url_set_query_params(url, query_params)
            log.explain(
                "Found ILIAS video frame page, fetching actual content next")
            return [
                IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED,
                                 url, "")
            ]

        is_paginated = self._soup.find(
            id=re.compile(r"tab_page_sel.+")) is not None

        # `!=` instead of the original `not ... ==`, which is harder to read
        # and flagged by linters.
        if is_paginated and self._page_type != IliasElementType.VIDEO_FOLDER:
            # We are in stage 2 - try to break pagination
            return self._find_video_entries_paginated()

        return self._find_video_entries_no_paging()
Ejemplo n.º 10
0
    def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
        """Collect all entries listed on the personal desktop page."""
        items: List[IliasPageElement] = []

        titles: List[Tag] = self._soup.select(".il-item-title")
        for title in titles:
            link = title.find("a")
            name = _sanitize_path_name(link.text.strip())
            url = self._abs_url_from_link(link)

            # Renamed from `type`, which shadowed the builtin.
            element_type = self._find_type_from_link(name, link, url)
            if not element_type:
                _unexpected_html_warning()
                log.warn_contd(f"Could not extract type for {link}")
                continue

            log.explain(f"Found {name!r}")

            # Desktop file links sometimes lack the download part; rewrite
            # them so fetching the URL yields the file itself.
            if element_type == IliasElementType.FILE and "_download" not in url:
                url = re.sub(r"(target=file_\d+)", r"\1_download", url)
                log.explain("Rewired file URL to include download part")

            items.append(IliasPageElement(element_type, url, name))

        return items
Ejemplo n.º 11
0
 def get_child_elements(self) -> List[IliasPageElement]:
     """
     Return all child page elements you can find here.
     """
     # Probe the page kinds from most to least specific; first match wins.
     dispatch = [
         (self._is_video_player, self._player_to_video,
          "Page is a video player, extracting URL"),
         (self._is_video_listing, self._find_video_entries,
          "Page is a video listing, searching for elements"),
         (self._is_exercise_file, self._find_exercise_entries,
          "Page is an exercise, searching for elements"),
         (self._is_personal_desktop, self._find_personal_desktop_entries,
          "Page is the personal desktop, searching for elements"),
         (self._is_content_page, self._find_copa_entries,
          "Page is a content page, searching for elements"),
     ]
     for predicate, handler, message in dispatch:
         if predicate():
             log.explain(message)
             return handler()
     log.explain("Page is a normal folder, searching for elements")
     return self._find_normal_entries()
Ejemplo n.º 12
0
    def _find_exercise_entries_root_page(self) -> List[IliasPageElement]:
        """
        Collect all file entries from an exercise overview page.

        Each assignment lives in its own accordion container; its header name
        becomes the path prefix for the files found inside.
        """
        results: List[IliasPageElement] = []

        for container in self._soup.select(".il_VAccordionInnerContainer"):
            # The header carries the assignment name, used as path prefix.
            container_name = container.select_one(
                ".ilAssignmentHeader").getText().strip()
            log.explain(f"Found exercise container {container_name!r}")

            # Direct download links are identified by their command class.
            download_links: List[Tag] = container.findAll(
                name="a",
                attrs={
                    "href": lambda x: x and "cmdClass=ilexsubmissiongui" in x
                },
                text="Download")

            for download_link in download_links:
                # Two divs, side by side: left is the name, right is the
                # link, so grab the left sibling.
                raw_name = download_link.parent.findPrevious(
                    name="div").getText().strip()
                file_name = _sanitize_path_name(raw_name)
                url = self._abs_url_from_link(download_link)

                log.explain(f"Found exercise entry {file_name!r}")
                results.append(
                    IliasPageElement(
                        IliasElementType.FILE,
                        url,
                        container_name + "/" + file_name,
                        None  # We do not have any timestamp
                    ))

            # Links to file listings (e.g. "Submitted Files" for groups) use
            # a different command class.
            listing_links: List[Tag] = container.findAll(
                name="a",
                attrs={
                    "href":
                    lambda x: x and "cmdClass=ilexsubmissionfilegui" in x
                })

            for listing_link in listing_links:
                # The form-group label next to the listing link is its name.
                form_group: Tag = listing_link.findParent(
                    "div", attrs={"class": lambda x: x and "form-group" in x})
                label: Tag = form_group.find(
                    attrs={"class": lambda x: x and "control-label" in x})
                file_name = _sanitize_path_name(label.getText().strip())
                url = self._abs_url_from_link(listing_link)

                log.explain(f"Found exercise detail {file_name!r} at {url}")
                results.append(
                    IliasPageElement(
                        IliasElementType.EXERCISE_FILES,
                        url,
                        container_name + "/" + file_name,
                        None  # we do not have any timestamp
                    ))

        return results