def get_simple_text(wtext, key, clean=True):
    """Return the text stored under ``key`` in ``wtext``, or ``None``.

    Candidate keys are tried in order. For each candidate the mapping
    returned by ``adapter.template_dict`` is consulted first; if that gives
    no value, the line returned by ``wtext_help.find_key_val_line`` is split
    at its first ``=`` and the stripped right-hand side is used instead.

    Args:
        wtext: The wiki text to search. Its exact format is defined by the
            ``adapter`` / ``wtext_help`` helpers, which are not visible here.
        key: A single key, or a list of alternative keys tried in order.
        clean: When true, a found value is passed through
            ``clean_help.clean_text`` before it is returned.

    Returns:
        The first truthy value found for any candidate key; otherwise
        whatever falsy value the last lookup produced (typically ``None``).
    """
    # isinstance() also accepts list subclasses, unlike ``type(key) is list``.
    keys = key if isinstance(key, list) else [key]

    template_dict = adapter.template_dict(wtext)
    wtext_lines = wtext_help.get_wtext_lines(wtext)

    text = None
    for possible_key in keys:
        # First choice: the parsed template mapping.
        if template_dict:
            text = template_dict.get(possible_key)

        # Fallback: look for a raw "key = value" line.
        if not text and wtext_lines:
            # NOTE(review): the raw wtext is passed here although the guard
            # checks wtext_lines; confirm which form find_key_val_line expects.
            matched_line = wtext_help.find_key_val_line(wtext, possible_key)
            if matched_line:
                # maxsplit=1 keeps any "=" inside the value intact.
                key_val = matched_line.strip(' \t\n\r').split("=", 1)
                if len(key_val) == 2:
                    text = key_val[1].strip()

        # Stop at the first candidate that produced a value.
        if text:
            break

    if text and clean:
        text = clean_help.clean_text(text)

    return text
def _wikilink_targets(value):
    """Return the target part of every ``[[target|label]]`` link in ``value``."""
    matches = re.findall(r"\[\[(.*?)\]\]", value)
    # Everything after the first "|" is the display label, not the target.
    return [link.split("|", 1)[0] for link in matches]


def extract_page_links(wtext, key):
    """Return the ``[[...]]`` link targets found in the value of ``key``.

    Candidate keys are tried in order. For each candidate the mapping
    returned by ``adapter.template_dict`` is consulted first; if that gives
    no links, the line returned by ``wtext_help.find_key_val_line`` is split
    at its first ``=`` and the right-hand side is scanned instead.

    Args:
        wtext: The wiki text to search. Its exact format is defined by the
            ``adapter`` / ``wtext_help`` helpers, which are not visible here.
        key: A single key, or a list of alternative keys tried in order.

    Returns:
        A list of link targets (the text before any ``|`` inside each
        ``[[...]]``) from the first candidate key that yields at least one
        link, or an empty list when none does.
    """
    # isinstance() also accepts list subclasses, unlike ``type(key) is list``.
    keys = key if isinstance(key, list) else [key]

    template_dict = adapter.template_dict(wtext)
    wtext_lines = wtext_help.get_wtext_lines(wtext)

    links = []
    for possible_key in keys:
        # First choice: the parsed template mapping.
        if template_dict:
            value = template_dict.get(possible_key)
            if value:
                links = _wikilink_targets(value)

        # Fallback: look for a raw "key = value" line.
        if not links and wtext_lines:
            matched_line = wtext_help.find_key_val_line(wtext_lines, possible_key)
            if matched_line:
                # maxsplit=1 so a value that itself contains "=" is still
                # recognised as a key/value pair instead of being dropped.
                key_val = matched_line.strip(' \t\n\r').split("=", 1)
                if len(key_val) == 2:
                    links = _wikilink_targets(key_val[1].strip())

        # Stop at the first candidate that produced links.
        if links:
            break

    return links
# Example #3
# 0
def extract_page_name(wtext):
    """Return the first quoted name found at the start of a line, or ``None``.

    The lines returned by ``wtext_help.get_wtext_lines`` are scanned in
    order. The first line that begins with two apostrophes is split on
    ``''`` and its first non-empty piece is returned. Lines that begin with
    ``''`` but contain no non-empty piece are skipped.

    Args:
        wtext: The wiki text to search. Its exact format is defined by
            ``wtext_help.get_wtext_lines``, which is not visible here.

    Returns:
        The extracted name, or ``None`` when no line qualifies.
    """
    wtext_lines = wtext_help.get_wtext_lines(wtext)

    # ``or []`` tolerates a helper that returns None for empty input.
    for line in wtext_lines or []:
        if not line.startswith("''"):
            continue
        # NOTE(review): for a line starting with three apostrophes the
        # returned piece keeps one leading apostrophe ("'''X'''" -> "'X");
        # confirm whether callers rely on or clean up that character.
        name = next((piece for piece in line.split("''") if piece), None)
        if name:
            return name

    return None
# Example #4
# 0
def extract_infobox_name(wtext):
    """Return the normalised opening line of the first infobox, or ``None``.

    The lines returned by ``wtext_help.get_wtext_lines`` are scanned for the
    first one that, after stripping whitespace, starts with ``{{infobox`` or
    ``{{Infobox``. That line has surrounding whitespace and ``{`` characters
    stripped, is lower-cased, has every ``|`` removed, and is finally passed
    through ``clean_help.remove_html``.

    Args:
        wtext: The wiki text to search. Its exact format is defined by
            ``wtext_help.get_wtext_lines``, which is not visible here.

    Returns:
        The cleaned infobox header text, or ``None`` when there are no lines
        or no line opens an infobox.
    """
    wtext_lines = wtext_help.get_wtext_lines(wtext)
    if not wtext_lines:
        return None

    # Only these two capitalisations are recognised, as in the original.
    infobox_prefixes = ("{{infobox", "{{Infobox")
    stripped_lines = (line.strip() for line in wtext_lines)
    matched_line = next(
        (line for line in stripped_lines if line.startswith(infobox_prefixes)),
        None,
    )
    if not matched_line:
        return None

    header = matched_line.strip(' \t\n\r{').lower()
    return clean_help.remove_html(header.replace("|", ""))