Example #1
def process_page(page_url, languages):
    soup = get_soup(page_url)

    nextpage = ""
    nextpage_div = soup.find(id="mw-pages")
    last_link = nextpage_div.find_all("a")[-1]
    if NEXTPAGE_TEXT == last_link.text:
        nextpage = ROOT_URL + last_link.get("href")

    content = soup.find("div", {"class": "mw-category"})
    lis = content.findAll("li")
    for li in lis:
        link = li.find("a")["href"]
        li_url = ROOT_URL + link
        key = li.text.split(":")[1]
        sub_soup = get_soup(li_url)
        content = sub_soup.find("div", {"class": "mw-parser-output"}).find(
            "p", recursive=False
        )
        value = content.text.strip()
        languages[key] = value
        a_url = ALIAS_URL.format(li.text)
        soup_alias = get_soup(a_url)
        if ul_alias := soup_alias.find("ul", {"id": "mw-whatlinkshere-list"}):
            for alias_li in ul_alias.findAll("li"):
                alias_text = alias_li.find("a").text
                alias_key = alias_text.split(":")[1]
                languages[alias_key] = value

    return nextpage
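Every example on this page relies on the get_soup helper imported from scripts_utils. A minimal sketch of such a helper, assuming it only fetches the URL with requests and parses the response with BeautifulSoup (the real helper may add caching or retries), could look like this:

import requests
from bs4 import BeautifulSoup


def get_soup(url):
    # Fetch the page and fail loudly on HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")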
Example #2
def process_cs_page(url, results):
    soup = get_soup(url)

    nextpage = ""
    nextpage_div = soup.find(id="mw-pages")
    last_link = nextpage_div.find_all("a")[-1]
    if NEXTPAGE_TEXT == last_link.text:
        nextpage = ROOT_URL + last_link.get("href")

    divs_category = soup.find_all("div", {"class": "mw-category-group"})
    for div_category in divs_category:
        lis = div_category.find_all("li")
        for li in lis:
            template_link = li.find("a")
            template_url = ROOT_URL + template_link.get("href")
            template_name = template_link.text.split(":")[1]
            template_soup = get_soup(template_url)
            template_text_div = template_soup.find(
                "div", {"class": "mw-parser-output"})
            template_text = template_text_div.find("p").text.strip()
            if template_text.endswith("."):
                template_text = template_text[:-1]
            results[template_name] = template_text
            process_alias_page(template_link.text, template_text, results)

    return nextpage
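Because process_cs_page returns the URL of the next category page (or an empty string once the last page is reached), a caller can paginate with a simple loop. A hedged sketch, where START_URL is an assumed constant pointing at the first category page:

results = {}
url = START_URL  # assumed constant: URL of the first category page
while url:
    url = process_cs_page(url, results)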
Example #3
def get_content(url):
    soup = get_soup(url)
    content_div = soup.find("div", "mw-parser-output")
    content_div = content_div.findChild(
        "div", {"class": "mw-highlight"}, recursive=False
    )
    return content_div.text.split("\n")
Example #4
def get_text(url):
    soup = get_soup(url)
    span = soup.find("span", "form-of-definition")
    if not span:
        return ""
    res = span.text.replace(" term", "")
    res = res.replace(" [Term?]", "")
    return res
Example #5
def process_alias_page(key, value, results):
    url = ALIAS_URL.format(key)
    soup = get_soup(url)
    ul = soup.find("ul", {"id": ["mw-whatlinkshere-list"]})
    if not ul:
        return
    for alias in ul.find_all("a", {"class": ["mw-redirect"]}):
        alias = alias.text.replace("Modèle:", "")
        if alias == "modifier":
            continue
        results[alias] = value
Example #6
def process_regions_page(url, results):
    soup = get_soup(url)

    nextpage = ""
    nextpage_div = soup.find(id="mw-pages")
    last_link = nextpage_div.find_all("a")[-1]
    if NEXTPAGE_TEXT == last_link.text:
        nextpage = ROOT + last_link.get("href")

    content_div = soup.find("div", "mw-category-generated")
    lis = content_div.find_all("li")
    for li in lis:
        template_url = ROOT + li.find("a").get("href")
        template_name = li.text.split(":")[1]
        template_soup = get_soup(template_url)
        region = template_soup.find("span", {"id": ["région"]})
        if not region:
            continue
        results[template_name] = region.text.strip("()")
    return nextpage
Example #7
def process_alias_page(model, template_text, results):
    url = ALIAS_URL.format(model)
    soup = get_soup(url)
    ul = soup.find("ul", {"id": ["mw-whatlinkshere-list"]})
    if not ul:
        return
    for alias in ul.find_all("a", {"class": ["mw-redirect"]}):
        alias = alias.text.replace("Plantilla:", "")
        if alias == "editar":
            continue
        results[alias] = template_text
Example #8
def process_category_page(url, results):
    soup = get_soup(url)

    nextpage = ""
    nextpage_div = soup.find(id="mw-pages")
    last_link = nextpage_div.find_all("a")[-1]
    if NEXTPAGE_TEXT == last_link.text:
        nextpage = ROOT + last_link.get("href")

    content_div = soup.find("div", "mw-category-generated")
    lis = content_div.find_all("li")
    for li in lis:
        template_url = ROOT + li.find("a").get("href")
        template_name = li.text.split(":")[1]
        template_soup = get_soup(template_url)
        parser_output = template_soup.find("span",
                                           {"class": ["term", "texte"]})
        rendering = parser_output.text
        if template_name and rendering:
            results[template_name] = rendering.strip("()")

    return nextpage
Example #9
def process_page(url, repl, stop_line, var_name, print_result=True):
    """Convert the highlighted Lua module at `url` to Python source, exec it,
    and reduce the resulting `labels` table to a plain mapping, printed as
    `var_name` when `print_result` is True."""
    soup = get_soup(url)
    div = soup.find("div", {"class": "mw-highlight-lines"})
    text = div.text

    text = text.replace("local ", "")
    text = text.replace("end", "")
    text = text.replace("true", "True")
    text = text.replace("false", "False")
    text = text.replace("--", "#")

    text = re.sub(r"function\s+(\w+\([\w|\,]+\))", "def \\g<1>:", text)
    text = text.replace("for _,v in ipairs(y) do", "for v in y:")

    for r in repl:
        text = re.sub(rf"[ \t]+{r}[\s]*=", f'    "{r}":', text)

    code = ""
    for line in text.split("\n"):
        if line.strip().startswith(stop_line):
            break
        elif "require" not in line:
            code += line + "\n"

    # Execute the converted source so that its `labels` dict becomes available here.
    exec(code, globals())
    results = {}

    for k, v in labels.items():  # noqa
        label_v = v
        label_k = k
        if isinstance(v, str):
            label_v = labels.get(v, v)  # noqa
            if label_v != v:
                label_k = v
        if isinstance(label_v, str):
            display = label_v
        else:
            display = label_v.get("display", label_k)
        display = process_display(display)
        if display != k:
            results[k] = display
    if print_result:
        print(f"{var_name} = {{")
        for key, value in sorted(results.items()):
            print(f'    "{key}": "{value}",')
        print(f"}}  # {len(results):,}")
    return results
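The heart of the Lua-to-Python conversion in Example #9 is the function-header substitution. A small illustration with a made-up Lua line (note that the character class in the pattern only matches parameter lists written without spaces):

import re

lua_line = "function getlabel(data,lang)"  # made-up sample input
print(re.sub(r"function\s+(\w+\([\w|\,]+\))", "def \\g<1>:", lua_line))
# prints: def getlabel(data,lang):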
Example #10
from scripts_utils import get_soup

root_url = "https://de.wiktionary.org"
start_url = f"{root_url}/wiki/Kategorie:Wiktionary:Sprachadjektive"
alias_url = "https://de.wiktionary.org/w/index.php?title=Spezial:Linkliste/{}&hidetrans=1&hidelinks=1"
soup = get_soup(start_url)

content = soup.find("div", {"class": "mw-category"})
lis = content.findAll("li")
languages = {}
for li in lis:
    link = li.find("a")["href"]
    li_url = root_url + link
    key = li.text.split(":")[1]
    sub_soup = get_soup(li_url)
    content = sub_soup.find("div", {"class": "mw-parser-output"}).find("p")
    value = content.text.strip()
    languages[key] = value
    a_url = alias_url.format(li.text)
    soup_alias = get_soup(a_url)
    if ul_alias := soup_alias.find("ul", {"id": "mw-whatlinkshere-list"}):
        for alias_li in ul_alias.findAll("li"):
            alias_text = alias_li.find("a").text
            alias_key = alias_text.split(":")[1]
            languages[alias_key] = value

print("lang_adjs = {")
for key, value in sorted(languages.items()):
    print(f'    "{key}": "{value}",')
print(f"}}  # {len(languages):,}")
import re
from scripts_utils import get_soup

url = "https://pt.wiktionary.org/w/index.php?title=Predefini%C3%A7%C3%A3o:gram%C3%A1tica/core&action=edit"
soup = get_soup(url)
textarea = soup.find("textarea")

current_abbr = []
count = 0

text = textarea.text
text = text.replace("{{#ifeq:{{int:Log}}|{{:MediaWiki:Log}}|género|gênero}}",
                    "género")
text = text.replace("{{gramática/core/faltagenero|{{{2|}}}}}",
                    "gênero em falta")
text = re.sub("(<!--.*?-->)", "", text, flags=re.DOTALL)

print("gramatica_short = {")
for p in sorted(text.split("|")):
    p = p.strip()
    if p and "<!--" not in p and ("{" not in p) and ("}" not in p):
        if "=" in p:
            sArray = p.split("=")
            name = sArray[1].strip("'")
            print(f'    "{sArray[0]}": "{name}",')
            count += 1
            for abbr in sorted(current_abbr):
                print(f'    "{abbr}": "{name}",')
                count += 1
            current_abbr.clear()
        else: