def _find_exercise_entries(self) -> List[IliasPageElement]:
    """
    Dispatch to the matching exercise parser based on the page layout.

    Only the detail page of a single exercise carries a submission tab,
    so its presence tells the two page kinds apart.
    """
    submission_tab = self._soup.find(id="tab_submission")
    if submission_tab:
        log.explain("Found submission tab. This is an exercise detail page")
        return self._find_exercise_entries_detail_page()

    log.explain("Found no submission tab. This is an exercise root page")
    return self._find_exercise_entries_root_page()
def _file_to_element(self, name: str, url: str, link_element: Tag) -> IliasPageElement:
    """
    Build a FILE element from a container item link, enriching it with the
    file type and (when parseable) the modification date taken from the
    item's property list.
    """
    # Files have a list of properties (type, modification date, size, etc.)
    # in a series of divs. Find the parent containing all those divs, so we
    # can filter out what we need.
    # Guard with `x and`: tags without a class attribute pass None to the
    # lambda, which would otherwise raise a TypeError. This matches the
    # guards used by the other attribute lambdas in this file.
    properties_parent: Tag = link_element.findParent(
        "div", {"class": lambda x: x and "il_ContainerListItem" in x}
    ).select_one(".il_ItemProperties")

    # The first one is always the filetype
    file_type = properties_parent.select_one("span.il_ItemProperty").getText().strip()

    # The rest does not have a stable order. Grab the whole text and reg-ex
    # the date out of it.
    all_properties_text = properties_parent.getText().strip()
    modification_date_match = re.search(
        r"(((\d+\. \w+ \d+)|(Gestern|Yesterday)|(Heute|Today)|(Morgen|Tomorrow)), \d+:\d+)",
        all_properties_text
    )
    if modification_date_match is None:
        modification_date = None
        log.explain(f"Element {name} at {url} has no date.")
    else:
        modification_date_str = modification_date_match.group(1)
        modification_date = demangle_date(modification_date_str)

    # Grab the name from the link text
    full_path = name + "." + file_type

    log.explain(f"Found file {full_path!r}")
    return IliasPageElement(IliasElementType.FILE, url, full_path, modification_date)
def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]:
    """
    Collect all downloadable files from a single exercise's detail page.

    The date of each entry has no stable column, so we scan the row's cells
    from the right for anything that parses as a date.
    """
    results: List[IliasPageElement] = []

    # Find all download links in the container (this will contain all the files)
    download_links: List[Tag] = self._soup.findAll(
        name="a",
        # download links contain the given command class
        attrs={"href": lambda x: x and "cmd=download" in x},
        text="Download"
    )

    for link in download_links:
        parent_row: Tag = link.findParent("tr")
        children: List[Tag] = parent_row.findChildren("td")

        name = _sanitize_path_name(children[1].getText().strip())
        log.explain(f"Found exercise detail entry {name!r}")

        # Initialize before the scan so a row without any cells cannot leave
        # `date` unbound (previously a potential NameError).
        date = None
        for child in reversed(children):
            date = demangle_date(child.getText().strip(), fail_silently=True)
            if date is not None:
                break
        if date is None:
            log.warn(f"Date parsing failed for exercise entry {name!r}")

        results.append(IliasPageElement(
            IliasElementType.FILE,
            self._abs_url_from_link(link),
            name,
            date
        ))

    return results
def _find_video_entries_paginated(self) -> List[IliasPageElement]:
    """
    Try to defeat pagination on a video listing by re-requesting the table
    with a very large row count, re-queueing the folder as a new entry.

    Falls back to parsing the current (paginated) page if the table or its
    id cannot be located.
    """
    table: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+"))
    if table is None:
        log.warn("Couldn't increase elements per page (table not found). I might miss elements.")
        return self._find_video_entries_no_paging()

    id_match = re.match(r"tbl_xoct_(.+)", table.attrs["id"])
    if id_match is None:
        log.warn("Couldn't increase elements per page (table id not found). I might miss elements.")
        return self._find_video_entries_no_paging()

    # 800 rows per page should comfortably cover every course
    params = {
        f"tbl_xoct_{id_match.group(1)}_trows": "800",
        "cmd": "asyncGetTableGUI",
        "cmdMode": "asynch",
    }
    unpaginated_url = url_set_query_params(self._page_url, params)

    log.explain("Disabled pagination, retrying folder as a new entry")
    return [IliasPageElement(IliasElementType.VIDEO_FOLDER, unpaginated_url, "")]
def _listed_video_to_element(self, link: Tag) -> IliasPageElement:
    """
    Convert a row of the video listing table into a VIDEO_PLAYER element.

    The link sits in a table with several metadata columns. The modification
    time is usually in the 6th or 7th column (1-indexed), so we walk the
    columns from right to left until a cell looks like a date.
    """
    row: Tag = link.parent.parent.parent
    modification_time = None

    for column in range(len(row.select("td.std")), 0, -1):
        cell_text = row.select_one(f"td.std:nth-child({column})").getText().strip()
        if re.search(r"\d+\.\d+.\d+ - \d+:\d+", cell_text):
            modification_time = datetime.strptime(cell_text, "%d.%m.%Y - %H:%M")
            break

    if modification_time is None:
        log.warn(f"Could not determine upload time for {link}")
        modification_time = datetime.now()

    title = row.select_one("td.std:nth-child(3)").getText().strip() + ".mp4"
    video_name: str = _sanitize_path_name(title)
    video_url = self._abs_url_from_link(link)

    log.explain(f"Found video {video_name!r} at {video_url}")
    return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time)
def _find_normal_entries(self) -> List[IliasPageElement]:
    """
    Parse a regular ILIAS folder listing into page elements, delegating each
    item title link to the generic link interpreter.
    """
    result: List[IliasPageElement] = []

    # Fetch all links and throw them to the general interpreter
    for link in self._soup.select("a.il_ContainerItemTitle"):
        abs_url = self._abs_url_from_link(link)
        parents = self._find_upwards_folder_hierarchy(link)

        base_name = _sanitize_path_name(link.getText())
        element_name = "/".join(parents) + "/" + base_name if parents else base_name

        element_type = self._find_type_from_link(element_name, link, abs_url)
        description = self._find_link_description(link)

        # The last meeting on every page is expanded by default.
        # Its content is then shown inline *and* in the meeting page itself.
        # We should skip the inline content.
        if element_type != IliasElementType.MEETING and self._is_in_expanded_meeting(link):
            continue

        if not element_type:
            continue

        if element_type == IliasElementType.MEETING:
            normalized = _sanitize_path_name(self._normalize_meeting_name(element_name))
            log.explain(f"Normalized meeting name from {element_name!r} to {normalized!r}")
            element_name = normalized
        elif element_type == IliasElementType.FILE:
            # Files carry extra metadata; delegate to the specialized builder
            result.append(self._file_to_element(element_name, abs_url, link))
            continue

        log.explain(f"Found {element_name!r}")
        result.append(IliasPageElement(element_type, abs_url, element_name, description=description))

    result += self._find_cards()

    return result
def _listed_video_to_element(self, link: Tag) -> IliasPageElement:
    """
    Convert a row of the video listing table into a VIDEO_PLAYER element.

    NOTE(review): this duplicates (and, being defined later, shadows) the
    `_listed_video_to_element` defined earlier in this file — one of the two
    should be removed. The old body hard-coded `td.std:nth-child(6)` as the
    modification-time column, which breaks when ILIAS renders a different
    column layout. Brought in line with the robust sibling: scan the columns
    from the right for something date-like and fall back to "now".
    """
    row: Tag = link.parent.parent.parent

    modification_time = None
    column_count = len(row.select("td.std"))
    for index in range(column_count, 0, -1):
        modification_string = row.select_one(f"td.std:nth-child({index})").getText().strip()
        if re.search(r"\d+\.\d+.\d+ - \d+:\d+", modification_string):
            modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
            break

    if modification_time is None:
        # Better a slightly wrong timestamp than a crash
        log.warn(f"Could not determine upload time for {link}")
        modification_time = datetime.now()

    title = row.select_one("td.std:nth-child(3)").getText().strip()
    title += ".mp4"

    video_name: str = _sanitize_path_name(title)
    video_url = self._abs_url_from_link(link)

    log.explain(f"Found video {video_name!r} at {video_url}")
    return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time)
def _player_to_video(self) -> List[IliasPageElement]:
    """
    Extract the actual stream URL(s) from an ILIAS video player page.

    The page is a small wrapper that boots a JavaScript player; we cannot
    execute that JS. The stream URLs only exist inside a JSON object passed
    to the player library in a script tag, so we regex that JSON straight
    out of the page's HTML source and parse it.
    """
    regex = re.compile(r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE)
    json_match = regex.search(str(self._soup))

    if json_match is None:
        log.warn("Could not find JSON stream info in video player. Ignoring video.")
        return []

    json_object = json.loads(json_match.group(1))
    streams = json_object["streams"]

    # Single stream: just fetch the lone video url, keeping our own name
    if len(streams) == 1:
        video_url = streams[0]["sources"]["mp4"][0]["src"]
        return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)]

    # Multiple streams (e.g. camera + screen): disambiguate names via the
    # stream's content tag, sorted for a stable order
    log.explain(f"Found multiple videos for stream at {self._source_name}")
    items = []
    for stream in sorted(streams, key=lambda stream: stream["content"]):
        full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4"
        video_url = stream["sources"]["mp4"][0]["src"]
        items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name))

    return items
def _find_video_entries(self) -> List[IliasPageElement]:
    """
    Figure out which of ILIAS' three video page stages we are on and parse
    accordingly:
      1. An initial dummy page without any videos, linking to the listing
      2. The video listing, which might be paginated
      3. An unpaginated listing (or one showing at least 800 videos)
    """
    video_table = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+"))

    if video_table is None:
        # Stage 1: the page itself is empty but contains the link to stage 2
        content_link: Tag = self._soup.select_one("#tab_series a")
        url: str = self._abs_url_from_link(content_link)
        url = url_set_query_params(url, {
            "limit": "800",
            "cmd": "asyncGetTableGUI",
            "cmdMode": "asynch",
        })
        log.explain("Found ILIAS video frame page, fetching actual content next")
        return [IliasPageElement(IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED, url, "")]

    is_paginated = self._soup.find(id=re.compile(r"tab_page_sel.+")) is not None

    if is_paginated and not self._page_type == IliasElementType.VIDEO_FOLDER:
        # Stage 2: try to break pagination
        return self._find_video_entries_paginated()

    # Stage 3
    return self._find_video_entries_no_paging()
def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
    """
    Parse the personal desktop / dashboard into page elements, rewriting
    file links so they point at the direct download.
    """
    items: List[IliasPageElement] = []

    titles: List[Tag] = self._soup.select(".il-item-title")
    for title in titles:
        link = title.find("a")

        name = _sanitize_path_name(link.text.strip())
        url = self._abs_url_from_link(link)

        # Renamed from `type` so we no longer shadow the builtin
        element_type = self._find_type_from_link(name, link, url)
        if not element_type:
            _unexpected_html_warning()
            log.warn_contd(f"Could not extract type for {link}")
            continue

        log.explain(f"Found {name!r}")

        if element_type == IliasElementType.FILE and "_download" not in url:
            # Desktop file links point at an info page; append the download
            # suffix so we fetch the file itself
            url = re.sub(r"(target=file_\d+)", r"\1_download", url)
            log.explain("Rewired file URL to include download part")

        items.append(IliasPageElement(element_type, url, name))

    return items
def get_child_elements(self) -> List[IliasPageElement]:
    """
    Return all child page elements you can find here.
    """
    # Checked in order; the first matching predicate decides the parser.
    page_kinds = [
        (self._is_video_player, "Page is a video player, extracting URL",
         self._player_to_video),
        (self._is_video_listing, "Page is a video listing, searching for elements",
         self._find_video_entries),
        (self._is_exercise_file, "Page is an exercise, searching for elements",
         self._find_exercise_entries),
        (self._is_personal_desktop, "Page is the personal desktop, searching for elements",
         self._find_personal_desktop_entries),
        (self._is_content_page, "Page is a content page, searching for elements",
         self._find_copa_entries),
    ]
    for predicate, message, parser in page_kinds:
        if predicate():
            log.explain(message)
            return parser()

    log.explain("Page is a normal folder, searching for elements")
    return self._find_normal_entries()
def _find_exercise_entries_root_page(self) -> List[IliasPageElement]:
    """
    Collect all downloadable files from an exercise overview page.

    Each assignment lives in its own accordion container; inside we pick up
    direct download links as well as links to per-group file listings, both
    prefixed with the assignment's name.
    """
    results: List[IliasPageElement] = []

    # Each assignment is in an accordion container
    for container in self._soup.select(".il_VAccordionInnerContainer"):
        # The header names the assignment; it becomes the path prefix
        container_name = container.select_one(".ilAssignmentHeader").getText().strip()
        log.explain(f"Found exercise container {container_name!r}")

        # Download links in the container carry a distinctive command class
        download_links: List[Tag] = container.findAll(
            name="a",
            attrs={"href": lambda x: x and "cmdClass=ilexsubmissiongui" in x},
            text="Download"
        )
        for file_link in download_links:
            # Two divs side by side: left holds the name, right the link.
            # Grab the left sibling for the name.
            raw_name = file_link.parent.findPrevious(name="div").getText().strip()
            file_name = _sanitize_path_name(raw_name)
            url = self._abs_url_from_link(file_link)

            log.explain(f"Found exercise entry {file_name!r}")
            results.append(IliasPageElement(
                IliasElementType.FILE,
                url,
                container_name + "/" + file_name,
                None  # We do not have any timestamp
            ))

        # Links to file listings (e.g. "Submitted Files" for groups)
        file_listings: List[Tag] = container.findAll(
            name="a",
            attrs={"href": lambda x: x and "cmdClass=ilexsubmissionfilegui" in x}
        )
        for listing in file_listings:
            # The listing's name comes from the control label next to it
            parent_container: Tag = listing.findParent(
                "div", attrs={"class": lambda x: x and "form-group" in x})
            label_container: Tag = parent_container.find(
                attrs={"class": lambda x: x and "control-label" in x})
            file_name = _sanitize_path_name(label_container.getText().strip())
            url = self._abs_url_from_link(listing)

            log.explain(f"Found exercise detail {file_name!r} at {url}")
            results.append(IliasPageElement(
                IliasElementType.EXERCISE_FILES,
                url,
                container_name + "/" + file_name,
                None  # We do not have any timestamp
            ))

    return results