async def get_folder_name(session, url, **kwargs):
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    title = soup.find("title")
    return str(title.string)


async def _producer(session, queue, url, base_path, session_kwargs):
    if url[-1] != "/":
        url += "/"

    async with session.get(url, **session_kwargs) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    links = soup.find_all("a")

    tasks = []
    for link in links:
        href = link.get("href")
        # Only follow links whose text matches their url-decoded href; this skips
        # sorting links and the parent-directory entry of the index page.
        if unquote(href) != str(link.string).strip():
            continue
        if href[-1] == "/":
            href = href[:-1]

        path = safe_path_join(base_path, href)

        if "." in href:
            # File entry: the text right after the link is used as a change indicator.
            checksum = str(link.next_sibling.string).strip()
            await queue.put({"url": url + href,
                             "path": path,
                             "session_kwargs": session_kwargs,
                             "checksum": checksum})
        else:
            # Sub-directory: recurse into it concurrently.
            coroutine = _producer(session, queue, url + href, path, session_kwargs)
            tasks.append(asyncio.ensure_future(coroutine))

    await asyncio.gather(*tasks)


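# Hedged usage sketch (not part of the original code): one way _producer could be driven,
# with an aiohttp session and an asyncio.Queue drained by a consumer task. The plain-print
# consumer and all names in this sketch are illustrative assumptions only.
async def _example_run_producer(url, base_path):
    import asyncio

    import aiohttp

    async def _print_consumer(queue):
        # Stand-in consumer: a real one would download item["url"] to item["path"].
        while True:
            item = await queue.get()
            print(item["url"], "->", item["path"])
            queue.task_done()

    queue = asyncio.Queue()
    async with aiohttp.ClientSession() as session:
        consumer = asyncio.ensure_future(_print_consumer(queue))
        await _producer(session, queue, url, base_path, session_kwargs={})
        await queue.join()
        consumer.cancel()

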
async def validate_url(session, queue, links_to_pdf, base_url, base_path, **kwargs):
    async with session.get(base_url, **kwargs) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())

    all_urls_from_site = set()
    links = soup.find_all("a")
    for link in links:
        all_urls_from_site.add(link.get("href"))

    for i in range(20):
        path = os.path.join(base_path, "Woche {}".format(i))
        for name, url in links_to_pdf.items():
            # links_to_pdf maps a file name to a callable that builds the week-specific
            # relative url; only urls that actually appear on the page are queued.
            real_url = url(i)
            if real_url not in all_urls_from_site:
                continue
            item_path = os.path.join(path, name + f" {i}.pdf")
            await queue.put({
                "path": item_path,
                "url": base_url + real_url,
                "session_kwargs": kwargs
            })


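# Hedged usage sketch (not part of the original code): illustrates the expected shape of
# links_to_pdf, where each value is a callable mapping the week index to a relative url.
# The concrete names, url patterns and placeholder base_url below are made up.
async def _example_validate_url(session, queue):
    links_to_pdf = {
        "Uebung": lambda week: f"exercises/serie{week}.pdf",      # hypothetical pattern
        "Loesung": lambda week: f"solutions/loesung{week}.pdf",   # hypothetical pattern
    }
    await validate_url(session, queue, links_to_pdf,
                       base_url="https://example.com/course/",    # placeholder url
                       base_path="Course")

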
async def parse_single_section(session, queue, download_settings, base_path, href, moodle_id,
                               last_updated_dict, process_external_links, keep_file_order,
                               password_mapper):
    async with session.get(href) as response:
        html = await response.read()

    section_id = re.search(r"&section=([0-9]+)", href).group(1)

    only_sections = SoupStrainer("li", id=f"section-{section_id}")
    soup = BeautifulSoup(html, get_beautiful_soup_parser(), parse_only=only_sections)

    section = soup.find("li", id=f"section-{section_id}")

    await _parse_section(session=session,
                         queue=queue,
                         download_settings=download_settings,
                         base_path=base_path,
                         section=section,
                         last_updated_dict=last_updated_dict,
                         moodle_id=moodle_id,
                         process_external_links=process_external_links,
                         keep_file_order=keep_file_order,
                         password_mapper=password_mapper)


async def get_folder_name(session, url, **kwargs):
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    header_name = str(soup.head.title.string)
    # Extract the name between "/~" and "/" in the page title.
    name = re.search("/~([^/]+)/", header_name)[1]
    return name


async def get_assign_files_tree(session, href):
    async with session.get(href) as response:
        text = await response.text()

    assign_files_tree = SoupStrainer("div", id=re.compile("assign_files_tree[0-9a-f]*"))
    return BeautifulSoup(text, get_beautiful_soup_parser(), parse_only=assign_files_tree)


async def get_folder_name(session, ilias_id, **kwargs):
    url = GOTO_URL + str(ilias_id)
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    ol = soup.find("ol", class_="breadcrumb")
    name = str(ol.find_all("li")[2].string)
    return remove_vz_id(name)


async def get_filemanager(session, href):
    async with session.get(href) as response:
        text = await response.text()

    only_file_tree = SoupStrainer("div", id=re.compile("folder_tree[0-9]+"), class_="filemanager")
    return BeautifulSoup(text, get_beautiful_soup_parser(), parse_only=only_file_tree)


async def _get_folder_name_s(session, poly_type, poly_id, password=None):
    if password is not None:
        await login_folder(session, poly_type, poly_id, password)

    url = INDEX_URL + poly_type + "/" + poly_id
    async with session.get(url=url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    data_info = soup.body.header.div
    author = " ".join(data_info["data-owner-display-name"].split(" ")[:2])
    name = data_info["data-name"]
    return name


def _get_page_meta(html, keys):
    possible_keys = set(keys)
    soup = BeautifulSoup(html, get_beautiful_soup_parser())

    result = {}
    # Scan the comma-separated pieces of the page's <script> contents for each requested
    # key and take the quoted value (double or single quotes) that follows it.
    for line in str(soup.findChildren("script")).split(','):
        for key in possible_keys:
            if key in line:
                if "\"" in line:
                    value = line.split("\"")[1]
                else:
                    value = line.split("'")[1]
                result[key] = value.strip()
                possible_keys.remove(key)
                break

    return result


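# Hedged usage sketch (not part of the original code): shows the kind of inline-script
# metadata _get_page_meta can pull out. The html snippet and key names below are invented
# for illustration only.
def _example_get_page_meta():
    html = """
    <html><head><script>
    var config = {downloadURL: "/files/video.mp4", title: 'Lecture 1'};
    </script></head><body></body></html>
    """
    meta = _get_page_meta(html, keys=["downloadURL", "title"])
    print(meta)  # expected: {'downloadURL': '/files/video.mp4', 'title': 'Lecture 1'}

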
async def parse_main_page(session, queue, html, base_path, download_settings, moodle_id,
                          process_external_links, keep_section_order, keep_file_order,
                          password_mapper):
    # The sesskey is embedded in an inline script of the course page (html is raw bytes).
    sesskey = re.search(b"""sesskey":"([^"]+)""", html)[1].decode("utf-8")

    update_json = await get_update_json(session=session, moodle_id=moodle_id, sesskey=sesskey)
    last_updated_dict = parse_update_json(update_json)

    only_sections = SoupStrainer("li", id=re.compile("section-([0-9]+)"))
    soup = BeautifulSoup(html, get_beautiful_soup_parser(), parse_only=only_sections)

    sections = soup.find_all("li", id=re.compile("section-([0-9]+)"), recursive=False)

    coroutines = [
        parse_sections(session=session,
                       queue=queue,
                       section=section,
                       base_path=base_path,
                       download_settings=download_settings,
                       moodle_id=moodle_id,
                       process_external_links=process_external_links,
                       last_updated_dict=last_updated_dict,
                       password_mapper=password_mapper,
                       index=index,
                       keep_section_order=keep_section_order,
                       keep_file_order=keep_file_order)
        for index, section in enumerate(sections)
    ]

    await asyncio.gather(*coroutines)


async def get_all_file_links(session, url, session_kwargs):
    async with session.get(url, **session_kwargs) as response:
        html = await response.text()

    all_links = dict()
    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    links = soup.find_all("a")
    for link in links:
        href = link.get("href", None)
        if not href:
            continue

        o = urlparse(href)
        # Only keep links whose path looks like a file (contains an extension).
        if "." not in o.path:
            continue

        result = urljoin(url, href)
        all_links[result] = str(link.text)

    return all_links


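# Hedged usage sketch (not part of the original code): get_all_file_links returns a
# mapping {absolute_url: link_text} for every <a> whose path contains a file extension.
# The page url below is a placeholder.
async def _example_get_all_file_links():
    import aiohttp

    async with aiohttp.ClientSession() as session:
        links = await get_all_file_links(session, "https://example.com/scripts/",
                                         session_kwargs={})
        for file_url, link_text in links.items():
            print(f"{link_text}: {file_url}")

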
async def search_tree(session, queue, base_path, download_settings, ilias_id):
    url = GOTO_URL + str(ilias_id)
    async with session.get(url) as response:
        html = await response.text()
        if str(response.url) != url:
            raise LoginError(
                "Module ilias isn't logged in or you are not allowed to access these files"
            )

    strainer = SoupStrainer("div", attrs={"class": "ilCLI ilObjListRow row"})
    soup = BeautifulSoup(html, get_beautiful_soup_parser(), parse_only=strainer)
    rows = soup.find_all("div", attrs={"class": "ilCLI ilObjListRow row"})

    tasks = []
    for row in rows:
        content = row.find("div", attrs={"class": "ilContainerListItemContent"})
        link = content.find("a")
        href = link["href"]
        name = str(link.string)
        path = safe_path_join(base_path, name)

        if "download" in href:
            # File row: the first "il_ItemProperty" span holds the extension; all property
            # spans together serve as a change indicator ("checksum").
            extension = str(
                content.find("span", attrs={"class": "il_ItemProperty"}).string).strip()
            checksum = "".join([
                str(x.string).strip()
                for x in content.find_all("span", attrs={"class": "il_ItemProperty"})
            ])

            # Normalize relative dates so the checksum does not change from one day to the next.
            if "Today" in checksum:
                today_date = datetime.datetime.now()
                checksum = checksum.replace(
                    "Today",
                    format_datetime(today_date, locale='en', format="dd. MMM YYYY"))
            elif "Yesterday" in checksum:
                yesterday_date = datetime.datetime.now() - datetime.timedelta(days=1)
                checksum = checksum.replace(
                    "Yesterday",
                    format_datetime(yesterday_date, locale='en', format="dd. MMM YYYY"))

            await queue.put({
                "url": href,
                "path": f"{path}.{extension}",
                "checksum": checksum
            })
        else:
            # Folder row: extract its ref_id and recurse into it concurrently.
            ref_id = re.search("ref_id=([0-9]+)&", href).group(1)
            coroutine = search_tree(session, queue, path, download_settings, ref_id)
            tasks.append(asyncio.ensure_future(coroutine))

    await asyncio.gather(*tasks)
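

# Hedged illustration (not part of the original code): the "Today"/"Yesterday" replacement
# above pins relative dates to the same "dd. MMM YYYY" pattern used in the code, presumably
# matching the absolute-date format ILIAS shows for older files. Quick check of that pattern:
def _example_checksum_date():
    import datetime

    from babel.dates import format_datetime

    stamp = format_datetime(datetime.datetime.now(), locale='en', format="dd. MMM YYYY")
    print(stamp)  # e.g. "05. Mar 2021"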