def _find_exercise_entries_detail_page(self) -> List[IliasPageElement]:
    results: List[IliasPageElement] = []

    # Find all download links in the container (this will contain all the files)
    download_links: List[Tag] = self._soup.findAll(
        name="a",
        # download links contain the download command in their URL
        attrs={"href": lambda x: x and "cmd=download" in x},
        text="Download"
    )

    for link in download_links:
        parent_row: Tag = link.findParent("tr")
        children: List[Tag] = parent_row.findChildren("td")

        name = _sanitize_path_name(children[1].getText().strip())
        log.explain(f"Found exercise detail entry {name!r}")

        # Find the date by parsing the row's cells back to front, as the date
        # column is not always in the same position
        date = None
        for child in reversed(children):
            date = demangle_date(child.getText().strip(), fail_silently=True)
            if date is not None:
                break
        if date is None:
            log.warn(f"Date parsing failed for exercise entry {name!r}")

        results.append(IliasPageElement(
            IliasElementType.FILE,
            self._abs_url_from_link(link),
            name,
            date
        ))

    return results

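# Illustrative sketch, not part of the crawler: how the href-lambda filter above
# behaves on a minimal, made-up exercise row. Uses bs4, which this module already
# depends on; the HTML snippet and this helper are hypothetical.
def _demo_find_download_links() -> None:
    from bs4 import BeautifulSoup

    html = """
    <table><tr>
        <td>1</td><td>sheet01.pdf</td><td>20. Apr 2021, 13:37</td>
        <td><a href="ilias.php?cmd=download">Download</a></td>
    </tr></table>
    """
    soup = BeautifulSoup(html, "html.parser")
    links = soup.findAll(
        name="a",
        attrs={"href": lambda x: x and "cmd=download" in x},
        text="Download"
    )
    assert [link["href"] for link in links] == ["ilias.php?cmd=download"]
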
def _find_video_entries_paginated(self) -> List[IliasPageElement]:
    table_element: Tag = self._soup.find(name="table", id=re.compile(r"tbl_xoct_.+"))

    if table_element is None:
        log.warn("Couldn't increase elements per page (table not found). I might miss elements.")
        return self._find_video_entries_no_paging()

    id_match = re.match(r"tbl_xoct_(.+)", table_element.attrs["id"])
    if id_match is None:
        log.warn("Couldn't increase elements per page (table id not found). I might miss elements.")
        return self._find_video_entries_no_paging()

    table_id = id_match.group(1)

    query_params = {
        f"tbl_xoct_{table_id}_trows": "800",
        "cmd": "asyncGetTableGUI",
        "cmdMode": "asynch"
    }
    url = url_set_query_params(self._page_url, query_params)

    log.explain("Disabled pagination, retrying folder as a new entry")
    return [IliasPageElement(IliasElementType.VIDEO_FOLDER, url, "")]

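# Illustrative sketch, not part of the crawler: what the pagination-bypass URL
# built above roughly looks like. Assumes url_set_query_params merges the given
# parameters into the URL's query string; the base URL, table id and this helper
# are made up.
def _demo_pagination_url() -> None:
    from urllib.parse import parse_qs, urlparse

    url = url_set_query_params(
        "https://ilias.example.com/ilias.php?baseClass=ilObjPluginDispatchGUI",
        {"tbl_xoct_abc123_trows": "800", "cmd": "asyncGetTableGUI", "cmdMode": "asynch"}
    )
    query = parse_qs(urlparse(url).query)
    assert query["tbl_xoct_abc123_trows"] == ["800"]
    assert query["cmd"] == ["asyncGetTableGUI"]
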
def _listed_video_to_element(self, link: Tag) -> IliasPageElement:
    # The link is part of a table with multiple columns, describing metadata.
    # The 6th or 7th child (1-indexed) is the modification time string. Try to find it
    # by parsing backwards from the end and looking for something that resembles a date.
    modification_time = None
    row: Tag = link.parent.parent.parent
    column_count = len(row.select("td.std"))
    for index in range(column_count, 0, -1):
        modification_string = row.select_one(f"td.std:nth-child({index})").getText().strip()
        if re.search(r"\d+\.\d+\.\d+ - \d+:\d+", modification_string):
            modification_time = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
            break

    if modification_time is None:
        log.warn(f"Could not determine upload time for {link}")
        modification_time = datetime.now()

    title = row.select_one("td.std:nth-child(3)").getText().strip()
    title += ".mp4"

    video_name: str = _sanitize_path_name(title)
    video_url = self._abs_url_from_link(link)

    log.explain(f"Found video {video_name!r} at {video_url}")
    return IliasPageElement(IliasElementType.VIDEO_PLAYER, video_url, video_name, modification_time)

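# Illustrative sketch, not part of the crawler: the date detection above accepts
# strings such as "24.04.2021 - 16:30". The sample value and this helper are made up.
def _demo_modification_time_parsing() -> None:
    modification_string = "24.04.2021 - 16:30"
    assert re.search(r"\d+\.\d+\.\d+ - \d+:\d+", modification_string) is not None
    parsed = datetime.strptime(modification_string, "%d.%m.%Y - %H:%M")
    assert parsed == datetime(2021, 4, 24, 16, 30)
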
def demangle_date(date_str: str, fail_silently: bool = False) -> Optional[datetime]:
    """
    Demangle a given date in one of the following formats (hour/minute part is optional):
    "Gestern, HH:MM"
    "Heute, HH:MM"
    "Morgen, HH:MM"
    "dd. mon yyyy, HH:MM"
    """
    try:
        # Normalize whitespace because users
        date_str = re.sub(r"\s+", " ", date_str)

        # The fourth positional argument of re.sub is `count`, not `flags`,
        # so the flags must be passed by keyword
        date_str = re.sub("Gestern|Yesterday", _format_date_english(_yesterday()), date_str, flags=re.I)
        date_str = re.sub("Heute|Today", _format_date_english(date.today()), date_str, flags=re.I)
        date_str = re.sub("Morgen|Tomorrow", _format_date_english(_tomorrow()), date_str, flags=re.I)
        date_str = date_str.strip()

        for german, english in zip(german_months, english_months):
            date_str = date_str.replace(german, english)
            # Remove trailing dots for abbreviations, e.g. "20. Apr. 2020" -> "20. Apr 2020"
            date_str = date_str.replace(english + ".", english)

        # We now have a nice english String in the format: "dd. mmm yyyy, hh:mm" or "dd. mmm yyyy"

        # Check if we have a time as well
        if ", " in date_str:
            day_part, time_part = date_str.split(",")
        else:
            day_part = date_str.split(",")[0]
            time_part = None

        day_str, month_str, year_str = day_part.split(" ")

        day = int(day_str.strip().replace(".", ""))
        month = english_months.index(month_str.strip()) + 1
        year = int(year_str.strip())

        if time_part:
            hour_str, minute_str = time_part.split(":")
            hour = int(hour_str)
            minute = int(minute_str)
            return datetime(year, month, day, hour, minute)

        return datetime(year, month, day)
    except Exception:
        if not fail_silently:
            log.warn(f"Date parsing failed for {date_str!r}")
        return None

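# Illustrative sketch, not part of the crawler: a few inputs demangle_date handles.
# The absolute dates and this helper are made up; the "Heute" case assumes
# _format_date_english produces the "dd. mmm yyyy" form the parser below expects.
def _demo_demangle_date() -> None:
    assert demangle_date("20. Apr 2020, 13:37") == datetime(2020, 4, 20, 13, 37)
    # Trailing abbreviation dots are stripped before parsing
    assert demangle_date("20. Apr. 2020") == datetime(2020, 4, 20)
    # Relative dates resolve against the current day
    today = date.today()
    assert demangle_date("Heute, 10:00") == datetime(today.year, today.month, today.day, 10, 0)
    # Unparseable input yields None (and logs a warning unless fail_silently is set)
    assert demangle_date("garbage", fail_silently=True) is None
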
def _player_to_video(self) -> List[IliasPageElement]:
    # Fetch the actual video page. This is a small wrapper page initializing a JavaScript
    # player. Sadly we can not execute that JS. The actual video stream url is nowhere
    # on the page, but defined in a JS object inside a script tag, passed to the player
    # library.
    # We do the impossible and RegEx the stream JSON object out of the page's HTML source
    regex = re.compile(r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE)
    json_match = regex.search(str(self._soup))

    if json_match is None:
        log.warn("Could not find JSON stream info in video player. Ignoring video.")
        return []
    json_str = json_match.group(1)

    # parse it
    json_object = json.loads(json_str)
    streams = list(json_object["streams"])

    # and just fetch the lone video url!
    if len(streams) == 1:
        video_url = streams[0]["sources"]["mp4"][0]["src"]
        return [IliasPageElement(IliasElementType.VIDEO, video_url, self._source_name)]

    log.explain(f"Found multiple videos for stream at {self._source_name}")
    items = []
    for stream in sorted(streams, key=lambda stream: stream["content"]):
        full_name = f"{self._source_name.replace('.mp4', '')} ({stream['content']}).mp4"
        video_url = stream["sources"]["mp4"][0]["src"]
        items.append(IliasPageElement(IliasElementType.VIDEO, video_url, full_name))

    return items

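# Illustrative sketch, not part of the crawler: the stream regex above applied to
# a minimal, made-up page source. The script contents and this helper are hypothetical.
def _demo_stream_regex() -> None:
    page = (
        '<script>loadPlayer({"streams": [{"content": "presenter", "sources":'
        ' {"mp4": [{"src": "https://example.com/video.mp4"}]}}]},'
        ' {"paella_config_file": "config.json"});</script>'
    )
    regex = re.compile(r"({\"streams\"[\s\S]+?),\s*{\"paella_config_file", re.IGNORECASE)
    match = regex.search(page)
    assert match is not None
    streams = json.loads(match.group(1))["streams"]
    assert streams[0]["sources"]["mp4"][0]["src"] == "https://example.com/video.mp4"
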
@dataclass
class IliasPageElement:
    type: IliasElementType
    url: str
    name: str
    mtime: Optional[datetime] = None
    description: Optional[str] = None

    def id(self) -> str:
        regexes = [
            r"eid=(?P<id>[0-9a-z\-]+)",
            r"file_(?P<id>\d+)",
            r"ref_id=(?P<id>\d+)",
            r"target=[a-z]+_(?P<id>\d+)"
        ]

        for regex in regexes:
            if match := re.search(regex, self.url):
                return match.groupdict()["id"]

        # Fall back to URL
        log.warn(f"Didn't find identity for {self.name} - {self.url}. Please report this.")
        return self.url

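# Illustrative sketch, not part of the crawler: which ids the regexes above
# extract from typical ILIAS-style URLs. The URLs and this helper are made up.
def _demo_page_element_id() -> None:
    element = IliasPageElement(
        IliasElementType.FILE, "https://ilias.example.com/goto.php?target=file_12345", "foo.pdf"
    )
    assert element.id() == "12345"

    element = IliasPageElement(
        IliasElementType.FILE, "https://ilias.example.com/ilias.php?ref_id=678&cmd=view", "bar"
    )
    assert element.id() == "678"
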
def _unexpected_html_warning() -> None:
    log.warn("Encountered unexpected HTML structure, ignoring element.")