async def get_folder_name(session, url, **kwargs):
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    title = soup.find("title")
    return str(title.string)


async def _producer(session, queue, url, base_path, session_kwargs):
    if url[-1] != "/":
        url += "/"

    async with session.get(url, **session_kwargs) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    links = soup.find_all("a")

    tasks = []
    for link in links:
        href = link.get("href")
        # Only follow links whose text matches their url-decoded href; this skips
        # sorting links and the parent-directory entry of the index page.
        if unquote(href) != str(link.string).strip():
            continue
        if href[-1] == "/":
            href = href[:-1]

        path = safe_path_join(base_path, href)

        if "." in href:
            # File entry: the text right after the link is used as a change indicator.
            checksum = str(link.next_sibling.string).strip()
            await queue.put({"url": url + href,
                             "path": path,
                             "session_kwargs": session_kwargs,
                             "checksum": checksum})
        else:
            # Sub-directory: recurse into it concurrently.
            coroutine = _producer(session, queue, url + href, path, session_kwargs)
            tasks.append(asyncio.ensure_future(coroutine))

    await asyncio.gather(*tasks)


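# Hedged usage sketch (not part of the original code): one way _producer could be driven,
# with an aiohttp session and an asyncio.Queue drained by a consumer task. The plain-print
# consumer and all names in this sketch are illustrative assumptions only.
async def _example_run_producer(url, base_path):
    import asyncio

    import aiohttp

    async def _print_consumer(queue):
        # Stand-in consumer: a real one would download item["url"] to item["path"].
        while True:
            item = await queue.get()
            print(item["url"], "->", item["path"])
            queue.task_done()

    queue = asyncio.Queue()
    async with aiohttp.ClientSession() as session:
        consumer = asyncio.ensure_future(_print_consumer(queue))
        await _producer(session, queue, url, base_path, session_kwargs={})
        await queue.join()
        consumer.cancel()

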
async def validate_url(session, queue, links_to_pdf, base_url, base_path, **kwargs):
    async with session.get(base_url, **kwargs) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())

    all_urls_from_site = set()
    links = soup.find_all("a")
    for link in links:
        all_urls_from_site.add(link.get("href"))

    for i in range(20):
        path = os.path.join(base_path, "Woche {}".format(i))
        for name, url in links_to_pdf.items():
            # links_to_pdf maps a file name to a callable that builds the week-specific
            # relative url; only urls that actually appear on the page are queued.
            real_url = url(i)
            if real_url not in all_urls_from_site:
                continue
            item_path = os.path.join(path, name + f" {i}.pdf")
            await queue.put({
                "path": item_path,
                "url": base_url + real_url,
                "session_kwargs": kwargs
            })


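# Hedged usage sketch (not part of the original code): illustrates the expected shape of
# links_to_pdf, where each value is a callable mapping the week index to a relative url.
# The concrete names, url patterns and placeholder base_url below are made up.
async def _example_validate_url(session, queue):
    links_to_pdf = {
        "Uebung": lambda week: f"exercises/serie{week}.pdf",      # hypothetical pattern
        "Loesung": lambda week: f"solutions/loesung{week}.pdf",   # hypothetical pattern
    }
    await validate_url(session, queue, links_to_pdf,
                       base_url="https://example.com/course/",    # placeholder url
                       base_path="Course")

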
async def parse_single_section(session, queue, download_settings, base_path, href, moodle_id,
                               last_updated_dict, process_external_links, keep_file_order,
                               password_mapper):
    async with session.get(href) as response:
        html = await response.read()

    section_id = re.search(r"&section=([0-9]+)", href).group(1)

    only_sections = SoupStrainer("li", id=f"section-{section_id}")
    soup = BeautifulSoup(html, get_beautiful_soup_parser(), parse_only=only_sections)

    section = soup.find("li", id=f"section-{section_id}")

    await _parse_section(session=session,
                         queue=queue,
                         download_settings=download_settings,
                         base_path=base_path,
                         section=section,
                         last_updated_dict=last_updated_dict,
                         moodle_id=moodle_id,
                         process_external_links=process_external_links,
                         keep_file_order=keep_file_order,
                         password_mapper=password_mapper)


async def get_folder_name(session, url, **kwargs):
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    header_name = str(soup.head.title.string)
    # Extract the name between "/~" and "/" in the page title.
    name = re.search("/~([^/]+)/", header_name)[1]
    return name


async def get_assign_files_tree(session, href):
    async with session.get(href) as response:
        text = await response.text()

    assign_files_tree = SoupStrainer("div", id=re.compile("assign_files_tree[0-9a-f]*"))
    return BeautifulSoup(text, get_beautiful_soup_parser(), parse_only=assign_files_tree)


async def get_folder_name(session, ilias_id, **kwargs):
    url = GOTO_URL + str(ilias_id)
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    ol = soup.find("ol", class_="breadcrumb")
    name = str(ol.find_all("li")[2].string)
    return remove_vz_id(name)


async def get_filemanager(session, href):
    async with session.get(href) as response:
        text = await response.text()

    only_file_tree = SoupStrainer("div", id=re.compile("folder_tree[0-9]+"), class_="filemanager")
    return BeautifulSoup(text, get_beautiful_soup_parser(), parse_only=only_file_tree)


async def _get_folder_name_s(session, poly_type, poly_id, password=None):
    if password is not None:
        await login_folder(session, poly_type, poly_id, password)

    url = INDEX_URL + poly_type + "/" + poly_id
    async with session.get(url=url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    data_info = soup.body.header.div
    author = " ".join(data_info["data-owner-display-name"].split(" ")[:2])
    name = data_info["data-name"]
    return name


def _get_page_meta(html, keys):
    possible_keys = set(keys)
    soup = BeautifulSoup(html, get_beautiful_soup_parser())

    result = {}
    # Scan the comma-separated pieces of the page's <script> contents for each requested
    # key and take the quoted value (double or single quotes) that follows it.
    for line in str(soup.findChildren("script")).split(','):
        for key in possible_keys:
            if key in line:
                if "\"" in line:
                    value = line.split("\"")[1]
                else:
                    value = line.split("'")[1]
                result[key] = value.strip()
                possible_keys.remove(key)
                break

    return result


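# Hedged usage sketch (not part of the original code): shows the kind of inline-script
# metadata _get_page_meta can pull out. The html snippet and key names below are invented
# for illustration only.
def _example_get_page_meta():
    html = """
    <html><head><script>
    var config = {downloadURL: "/files/video.mp4", title: 'Lecture 1'};
    </script></head><body></body></html>
    """
    meta = _get_page_meta(html, keys=["downloadURL", "title"])
    print(meta)  # expected: {'downloadURL': '/files/video.mp4', 'title': 'Lecture 1'}

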
async def parse_main_page(session, queue, html, base_path, download_settings, moodle_id,
                          process_external_links, keep_section_order, keep_file_order,
                          password_mapper):
    # The sesskey is embedded in an inline script of the course page (html is raw bytes).
    sesskey = re.search(b"""sesskey":"([^"]+)""", html)[1].decode("utf-8")

    update_json = await get_update_json(session=session, moodle_id=moodle_id, sesskey=sesskey)
    last_updated_dict = parse_update_json(update_json)

    only_sections = SoupStrainer("li", id=re.compile("section-([0-9]+)"))
    soup = BeautifulSoup(html, get_beautiful_soup_parser(), parse_only=only_sections)

    sections = soup.find_all("li", id=re.compile("section-([0-9]+)"), recursive=False)

    coroutines = [
        parse_sections(session=session,
                       queue=queue,
                       section=section,
                       base_path=base_path,
                       download_settings=download_settings,
                       moodle_id=moodle_id,
                       process_external_links=process_external_links,
                       last_updated_dict=last_updated_dict,
                       password_mapper=password_mapper,
                       index=index,
                       keep_section_order=keep_section_order,
                       keep_file_order=keep_file_order)
        for index, section in enumerate(sections)
    ]

    await asyncio.gather(*coroutines)


async def get_all_file_links(session, url, session_kwargs):
    async with session.get(url, **session_kwargs) as response:
        html = await response.text()

    all_links = dict()
    soup = BeautifulSoup(html, get_beautiful_soup_parser())
    links = soup.find_all("a")
    for link in links:
        href = link.get("href", None)
        if not href:
            continue

        o = urlparse(href)
        # Only keep links whose path looks like a file (contains an extension).
        if "." not in o.path:
            continue

        result = urljoin(url, href)
        all_links[result] = str(link.text)

    return all_links


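# Hedged usage sketch (not part of the original code): get_all_file_links returns a
# mapping {absolute_url: link_text} for every <a> whose path contains a file extension.
# The page url below is a placeholder.
async def _example_get_all_file_links():
    import aiohttp

    async with aiohttp.ClientSession() as session:
        links = await get_all_file_links(session, "https://example.com/scripts/",
                                         session_kwargs={})
        for file_url, link_text in links.items():
            print(f"{link_text}: {file_url}")

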
async def search_tree(session, queue, base_path, download_settings, ilias_id):
    url = GOTO_URL + str(ilias_id)
    async with session.get(url) as response:
        html = await response.text()
        if str(response.url) != url:
            raise LoginError(
                "Module ilias isn't logged in or you are not allowed to access these files"
            )

    strainer = SoupStrainer("div", attrs={"class": "ilCLI ilObjListRow row"})
    soup = BeautifulSoup(html, get_beautiful_soup_parser(), parse_only=strainer)
    rows = soup.find_all("div", attrs={"class": "ilCLI ilObjListRow row"})

    tasks = []
    for row in rows:
        content = row.find("div", attrs={"class": "ilContainerListItemContent"})
        link = content.find("a")
        href = link["href"]
        name = str(link.string)
        path = safe_path_join(base_path, name)

        if "download" in href:
            # File row: the first "il_ItemProperty" span holds the extension; all property
            # spans together serve as a change indicator ("checksum").
            extension = str(
                content.find("span", attrs={"class": "il_ItemProperty"}).string).strip()
            checksum = "".join([
                str(x.string).strip()
                for x in content.find_all("span", attrs={"class": "il_ItemProperty"})
            ])

            # Normalize relative dates so the checksum does not change from one day to the next.
            if "Today" in checksum:
                today_date = datetime.datetime.now()
                checksum = checksum.replace(
                    "Today",
                    format_datetime(today_date, locale='en', format="dd. MMM YYYY"))
            elif "Yesterday" in checksum:
                yesterday_date = datetime.datetime.now() - datetime.timedelta(days=1)
                checksum = checksum.replace(
                    "Yesterday",
                    format_datetime(yesterday_date, locale='en', format="dd. MMM YYYY"))

            await queue.put({
                "url": href,
                "path": f"{path}.{extension}",
                "checksum": checksum
            })
        else:
            # Folder row: extract its ref_id and recurse into it concurrently.
            ref_id = re.search("ref_id=([0-9]+)&", href).group(1)
            coroutine = search_tree(session, queue, path, download_settings, ref_id)
            tasks.append(asyncio.ensure_future(coroutine))

    await asyncio.gather(*tasks)
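

# Hedged illustration (not part of the original code): the "Today"/"Yesterday" replacement
# above pins relative dates to the same "dd. MMM YYYY" pattern used in the code, presumably
# matching the absolute-date format ILIAS shows for older files. Quick check of that pattern:
def _example_checksum_date():
    import datetime

    from babel.dates import format_datetime

    stamp = format_datetime(datetime.datetime.now(), locale='en', format="dd. MMM YYYY")
    print(stamp)  # e.g. "05. Mar 2021"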