def _find_type_from_folder_like(link_element: Tag, url: str) -> Optional[IliasElementType]:
    """
    Try crawling something that looks like a folder.

    Walks up the ancestors of ``link_element`` until it reaches one carrying the
    class ``ilContainerListItemOuter`` or ``il-std-item``, looks for the small
    descriptive icon inside it (``img.ilListItemIcon``, falling back to
    ``img.icon``) and derives the element type from that icon.

    :param link_element: the link whose surrounding list item is inspected
    :param url: the link target, only used in warning messages
    :return: the detected type, ``IliasElementType.FOLDER`` if the icon is not
        one of the known special cases, or ``None`` (after logging a warning) if
        no surrounding container or no icon image could be found
    """
    found_parent: Optional[Tag] = None

    # We look for the outer div of our inner link, to find information around it
    # (mostly the icon)
    for parent in link_element.parents:
        # Not every ancestor has a "class" attribute (the document root never
        # does). Indexing with [] would raise KeyError there and make the
        # warning path below unreachable, so fall back to an empty list.
        classes = parent.get("class") or []
        if "ilContainerListItemOuter" in classes or "il-std-item" in classes:
            found_parent = parent
            break

    if found_parent is None:
        _unexpected_html_warning()
        log.warn_contd(
            f"Tried to figure out element type, but did not find an icon for {url}"
        )
        return None

    # Find the small descriptive icon to figure out the type
    img_tag: Optional[Tag] = found_parent.select_one("img.ilListItemIcon")

    if img_tag is None:
        img_tag = found_parent.select_one("img.icon")

    if img_tag is None:
        _unexpected_html_warning()
        log.warn_contd(
            f"Tried to figure out element type, but did not find an image for {url}"
        )
        return None

    # A missing "alt" or "src" attribute is treated as an empty string instead
    # of raising, so such an icon simply falls through to the FOLDER default.
    if "opencast" in str(img_tag.get("alt", "")).lower():
        return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED

    icon_src = str(img_tag.get("src", ""))

    # Checked in order; the first matching file name suffix wins.
    suffix_to_type = (
        ("icon_exc.svg", IliasElementType.EXERCISE),
        ("icon_webr.svg", IliasElementType.LINK),
        ("icon_book.svg", IliasElementType.BOOKING),
        ("frm.svg", IliasElementType.FORUM),
        ("sess.svg", IliasElementType.MEETING),
        ("icon_tst.svg", IliasElementType.TEST),
    )
    for suffix, element_type in suffix_to_type:
        if icon_src.endswith(suffix):
            return element_type

    return IliasElementType.FOLDER
def _find_type_from_card(self, card_title: Tag) -> Optional[IliasElementType]:
    """
    Determine the element type of a card from the icon in its head section.

    Walks up the ancestors of ``card_title`` to the card root (an element
    carrying both the ``il-card`` and ``thumbnail`` classes), selects
    ``.il-card-repository-head .icon`` inside it and maps the icon's CSS
    classes to an element type.

    :param card_title: an element inside the card (e.g. its title link or button)
    :return: the detected type, or ``None`` (after logging a warning) if the
        card root or icon is missing or the icon classes are not recognised
    """
    def is_card_root(element: Tag) -> bool:
        # Ancestors without a "class" attribute (e.g. the document root) must
        # not raise KeyError, so read the attribute with a fallback.
        classes = element.get("class") or []
        return "il-card" in classes and "thumbnail" in classes

    # We look for the card root
    card_root: Optional[Tag] = next(
        (parent for parent in card_title.parents if is_card_root(parent)),
        None,
    )

    if card_root is None:
        _unexpected_html_warning()
        log.warn_contd(
            f"Tried to figure out element type, but did not find an icon for {card_title}"
        )
        return None

    icon: Optional[Tag] = card_root.select_one(".il-card-repository-head .icon")

    # select_one returns None when nothing matches; subscripting that would
    # raise TypeError, so report it like the other unexpected-HTML cases.
    if icon is None:
        _unexpected_html_warning()
        log.warn_contd(
            f"Could not find an icon in the card for card title {card_title}"
        )
        return None

    icon_classes = icon.get("class") or []

    # Checked in order; the first class marker that is present wins.
    marker_to_type = (
        ("opencast", IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED),
        ("exc", IliasElementType.EXERCISE),
        ("webr", IliasElementType.LINK),
        ("book", IliasElementType.BOOKING),
        ("frm", IliasElementType.FORUM),
        ("sess", IliasElementType.MEETING),
        ("tst", IliasElementType.TEST),
        ("fold", IliasElementType.FOLDER),
    )
    for marker, element_type in marker_to_type:
        if marker in icon_classes:
            return element_type

    _unexpected_html_warning()
    log.warn_contd(
        f"Could not extract type from {icon} for card title {card_title}")
    return None
def _find_type_from_link(element_name: str, link_element: Tag, url: str) -> Optional[IliasElementType]:
    """
    Decides which sub crawler to use for a given top level element.

    The decision is based on markers in the query string of ``url``. Links that
    carry a ``ref_id`` or point to ``goto.php`` but match no marker are handed
    to ``IliasPage._find_type_from_folder_like``, which inspects the icon.

    :param element_name: display name of the element, only used for logging
    :param link_element: the link tag, passed on for icon-based detection
    :param url: the link target to classify
    :return: the detected type, or ``None`` after logging a warning
    """
    url_parts = urlparse(url)
    query = url_parts.query

    # file URLs contain "target=file"
    if "target=file_" in query:
        return IliasElementType.FILE

    # Groups and courses are both crawled like folders
    if any(marker in query for marker in ("target=grp_", "target=crs_")):
        return IliasElementType.FOLDER

    if "baseClass=ilExerciseHandlerGUI" in query:
        return IliasElementType.EXERCISE

    is_link_resource = "baseClass=ilLinkResourceHandlerGUI" in query
    if is_link_resource and "calldirectlink" in query:
        return IliasElementType.LINK

    if any(marker in query for marker in ("cmd=showThreads", "target=frm_")):
        return IliasElementType.FORUM

    if "cmdClass=ilobjtestgui" in query:
        return IliasElementType.TEST

    # Booking and Meeting can not be detected based on the link. They do have a ref_id though, so
    # try to guess it from the image.

    # Everything with a ref_id can *probably* be opened to reveal nested things
    # video groups, directories, exercises, etc
    looks_nested = "ref_id=" in query or "goto.php" in url_parts.path
    if not looks_nested:
        _unexpected_html_warning()
        log.warn_contd(
            f"Tried to figure out element type, but failed for {element_name!r} / {link_element!r})"
        )
        return None

    return IliasPage._find_type_from_folder_like(link_element, url)
def _find_copa_entries(self) -> List[IliasPageElement]:
    """
    Collect the file entries of a content page.

    Every element with the class ``ilc_flist_a_FileListItemLink`` is turned
    into a ``FILE`` page element. Links whose absolute URL does not contain
    ``file_id`` are reported with a warning and left out.

    :return: the file elements found, in document order
    """
    entries: List[IliasPageElement] = []
    file_links: List[Tag] = self._soup.findAll(
        class_="ilc_flist_a_FileListItemLink")

    for anchor in file_links:
        target = self._abs_url_from_link(anchor)
        # Tabs are removed from the link text before it is sanitized
        link_text = anchor.getText().strip()
        label = _sanitize_path_name(link_text.replace("\t", ""))

        if "file_id" in target:
            entries.append(
                IliasPageElement(IliasElementType.FILE, target, label))
        else:
            _unexpected_html_warning()
            log.warn_contd(
                f"Found unknown content page item {label!r} with url {target!r}"
            )

    return entries
def _find_personal_desktop_entries(self) -> List[IliasPageElement]:
    """
    Collect the entries listed on the personal desktop.

    Each ``.il-item-title`` element is expected to contain a link. The link's
    type is determined via ``_find_type_from_link``; entries whose type can not
    be determined are reported with a warning and skipped. File URLs that do
    not yet contain ``_download`` are rewritten to point at the download.

    :return: the page elements found, in document order
    """
    items: List[IliasPageElement] = []

    titles: List[Tag] = self._soup.select(".il-item-title")
    for title in titles:
        link = title.find("a")

        # A title without a link has nothing to crawl. Skip it instead of
        # crashing on ``None`` and losing all remaining entries.
        if link is None:
            log.explain(f"Skipping item without link: {title.getText().strip()!r}")
            continue

        name = _sanitize_path_name(link.text.strip())
        url = self._abs_url_from_link(link)

        element_type = self._find_type_from_link(name, link, url)
        if not element_type:
            _unexpected_html_warning()
            log.warn_contd(f"Could not extract type for {link}")
            continue

        log.explain(f"Found {name!r}")

        if element_type == IliasElementType.FILE and "_download" not in url:
            # Append "_download" to the "target=file_<id>" query part
            url = re.sub(r"(target=file_\d+)", r"\1_download", url)
            log.explain("Rewired file URL to include download part")

        items.append(IliasPageElement(element_type, url, name))

    return items
def _find_cards(self) -> List[IliasPageElement]:
    """
    Collect the elements that are displayed as cards.

    Two kinds of cards are handled:

    * cards whose title is a link (``.card-title a``), where the link itself
      provides the URL, and
    * cards whose title is a button (``.card-title button``), where the URL is
      taken from a ``window.open(...)`` call that appears after the button's id
      on the same line of the page source.

    Cards whose type or click target can not be determined are reported with a
    warning and skipped.

    :return: the page elements found, link cards first, then button cards
    """
    result: List[IliasPageElement] = []

    card_titles: List[Tag] = self._soup.select(".card-title a")

    for title in card_titles:
        url = self._abs_url_from_link(title)
        name = _sanitize_path_name(title.getText().strip())
        element_type = self._find_type_from_card(title)

        if not element_type:
            _unexpected_html_warning()
            log.warn_contd(f"Could not extract type for {title}")
            continue

        result.append(IliasPageElement(element_type, url, name))

    card_button_tiles: List[Tag] = self._soup.select(".card-title button")

    # Serialising the document is expensive, so do it once for all buttons
    # (and not at all if there are none).
    page_source = str(self._soup) if card_button_tiles else ""

    for button in card_button_tiles:
        button_id = button.get("id")
        handler = None
        if button_id:
            # The id is matched literally, so escape any regex metacharacters
            handler = re.search(
                re.escape(str(button_id)) + r".*window.open\(['\"](.+?)['\"]",
                page_source,
            )

        if not handler:
            _unexpected_html_warning()
            log.warn_contd(
                f"Could not find click handler target for {button}")
            continue

        url = self._abs_url_from_relative(handler.group(1))
        name = _sanitize_path_name(button.getText().strip())
        element_type = self._find_type_from_card(button)

        if not element_type:
            _unexpected_html_warning()
            log.warn_contd(f"Could not extract type for {button}")
            continue

        # The description is the text of the div following the enclosing
        # "caption" div. Either may be absent, in which case there is none.
        # NOTE(review): assumes IliasPageElement accepts description=None - confirm.
        caption_parent = button.findParent(
            "div",
            attrs={"class": lambda x: x and "caption" in x},
        )
        caption_container = (
            caption_parent.find_next_sibling("div") if caption_parent else None
        )
        description = (
            caption_container.getText().strip() if caption_container else None
        )

        result.append(
            IliasPageElement(element_type, url, name, description=description))

    return result