Example #1
def extract_page_xmls(f):
    """Extract pages from a MediaWiki database dump.

    Parameters
    ----------
    f : file
        File descriptor of MediaWiki dump.

    Yields
    ------
    str
        XML strings for page tags.

    """
    elems = (elem for _, elem in cElementTree.iterparse(f, events=("end", )))

    elem = next(elems)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns_mapping

    for elem in elems:
        if elem.tag == page_tag:
            yield cElementTree.tostring(elem)
            # Prune the element tree, as per
            # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
            # except that we don't need to prune backlinks from the parent
            # because we don't use LXML.
            # We do this only for <page>s, since we need to inspect the
            # ./revision/text element. The pages comprise the bulk of the
            # file, so in practice we prune away enough.
            elem.clear()
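
A minimal usage sketch for this generator, assuming a hypothetical bz2-compressed dump path (the function only needs a binary file object):

import bz2

# Hypothetical dump path; any binary file object over the XML works.
with bz2.open("enwiki-latest-pages-articles.xml.bz2", "rb") as f:
    for i, page_xml in enumerate(extract_page_xmls(f)):
        if i >= 3:
            break
        print(page_xml[:80])  # raw serialized bytes of one <page> element
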
Example #2

def extract_pages_without_namespaces(f, filter_namespaces=False):
    """Extract pages from a MediaWiki database dump.

    Yields (title, text, pageid) triples; pages whose <ns> element is not
    in `filter_namespaces` yield an empty text string.

    """
    elems = (elem for _, elem in iterparse(f, events=("end", )))

    elem = next(elems)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns_mapping
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    pageid_path = "./{%(ns)s}id" % ns_mapping

    for elem in elems:
        if elem.tag == page_tag:
            title = elem.find(title_path).text
            text = elem.find(text_path).text

            if filter_namespaces:
                # Compare against None explicitly: Element truthiness tests
                # for child nodes, not presence. Missing <ns> defaults to "0".
                res = elem.find(ns_path)
                ns = res.text if res is not None else '0'
                if ns not in filter_namespaces:
                    text = None

            pageid = elem.find(pageid_path).text
            yield title, text or "", pageid  # .text is None for empty pages, hence the `or`

            elem.clear()
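
Every snippet here calls a get_namespace helper without defining it (it comes from gensim.corpora.wikicorpus). A minimal sketch of such a helper, which extracts the Clark-notation namespace URI from a tag; the gensim original additionally validates that the URI is a MediaWiki export namespace:

import re

def get_namespace(tag):
    # A tag like "{http://www.mediawiki.org/xml/export-0.10/}page"
    # carries its namespace URI in braces; return just the URI.
    m = re.match(r"^{(.*?)}", tag)
    return m.group(1) if m else ""
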
Example #3
def process_stream(args):
    path, vocab = args

    page_entity_word_co_occur = dict()
    context_entity_word_co_occur = dict()
    word_count = dict()

    xml_iterator = iterparse(bz2.BZ2File(path), events=("end", ))
    page_counter = 0
    for elem_idx, (_, elem) in enumerate(xml_iterator):
        if elem_idx == 0:
            namespace = get_namespace(elem.tag)
            ns_mapping = {"ns": namespace}
            page_tag = "{%(ns)s}page" % ns_mapping
            text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
            title_path = "./{%(ns)s}title" % ns_mapping
        if elem.tag == page_tag:
            title = elem.find(title_path).text.replace(' ', '_')
            text = elem.find(text_path).text
            if text is None:
                text = ""
            if parse_page(title,
                          text,
                          vocab,
                          page_entity_word_co_occur,
                          context_entity_word_co_occur,
                          word_count,
                          entity_window_size=20):
                page_counter += 1
            elem.clear()
    return (page_entity_word_co_occur, context_entity_word_co_occur,
            word_count, page_counter)
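
process_stream packs both inputs into a single tuple, which suggests it is meant to be fanned out over dump shards with multiprocessing.Pool.map. A sketch with hypothetical paths and vocabulary:

from multiprocessing import Pool

paths = ["wiki-shard-00.xml.bz2", "wiki-shard-01.xml.bz2"]  # hypothetical
vocab = {"london", "paris", "tokyo"}                         # hypothetical

with Pool(processes=2) as pool:
    results = pool.map(process_stream, [(p, vocab) for p in paths])

# Each result is (page_entity_word_co_occur, context_entity_word_co_occur,
# word_count, page_counter); merge the per-shard dicts as needed.
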
Example #5
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str), str)
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) {interlinks}, article_id).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0', )
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    id_path = "./{%(ns)s}id" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    articleID = elem.find(id_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(
            top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [
        filter_wiki(section_content) for section_content in section_contents
    ]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks, articleID
    else:
        return title, sections, articleID
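
A usage sketch for this variant, which appends the article ID to the returned tuple (page_xml as produced by extract_page_xmls in Example #1; the path is hypothetical):

with open("dump.xml", "rb") as f:
    for page_xml in extract_page_xmls(f):
        title, sections, interlinks, article_id = segment(page_xml, include_interlinks=True)
        print(title, article_id, [heading for heading, _ in sections])
        break
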
Example #6
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str))
        Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks
    else:
        return title, sections
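
extract_page_xmls from Example #1 and this segment function compose into a streaming pipeline over a whole dump; a sketch, assuming a hypothetical local dump file:

import bz2

with bz2.open("enwiki-latest-pages-articles.xml.bz2", "rb") as f:  # hypothetical
    for page_xml in extract_page_xmls(f):
        title, sections = segment(page_xml)
        if not sections:  # page was outside the main namespace
            continue
        print(title, "->", len(sections), "sections")
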
Example #7
def process_dump(page_xml: Any) -> Optional[Dict[str, Any]]:
    """Processes the page xml and returns a Dict with title, sections and links

    Parameters
    ----------
    page_xml : Any
        XML extracted from Wikipedia dump

    Returns
    -------
    dict or None
        Mapping with title, sections and links; None for pages outside the
        main namespace and for redirects.
    """
    # Parse XML element
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ("0",)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping

    title = elem.find(title_path).text

    text = elem.find(text_path).text
    ns = elem.find(ns_path).text

    # Get the article name of the links in the article
    links_in_document = list(segment(page_xml, include_interlinks=True)[2])

    # Filter invalid namespaces (user pages, etc)
    if ns not in filter_namespaces:
        return

    # Ignore empty pages and redirects; text can be None, and the
    # case-insensitive check covers all #REDIRECT capitalizations
    if text is None or "#redirect" in text.lower():
        return

    # TODO (optional) should transform this into a data class
    return {
        "title": title,
        "sections": process_text(text),
        "links": links_in_document,
    }
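
Because process_dump returns None for filtered pages, callers must skip empty results; a hedged driver sketch with a hypothetical path:

with open("dump.xml", "rb") as f:
    for page_xml in extract_page_xmls(f):
        doc = process_dump(page_xml)
        if doc is None:
            continue
        print(doc["title"], len(doc["links"]), "links")
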
Example #8
def segment(page_xml):
    """Parse the content inside a page tag

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    Returns
    -------
    (str, list of (str, str))
        Structure contains (title, [(section_heading, section_content)]).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0', )
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(
            top_level_heading_regex_capture, text)
        assert len(section_contents) == len(section_headings)
    else:
        section_contents = []
        section_headings = []

    section_contents = [
        filter_wiki(section_content) for section_content in section_contents
    ]
    sections = list(zip(section_headings, section_contents))
    return title, sections
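
The two heading regexes are worth a worked example: the [^=] guards restrict matches to top-level "== Heading ==" lines (so "===" sub-headings stay inside their section), and the captured group keeps its surrounding spaces, which is why other variants call .strip() on the headings:

import re

sample = "Lead.\n== History ==\nOld.\n=== Detail ===\nmore\n== Usage ==\nNew.\n"
print(re.split(r"\n==[^=].*[^=]==\n", sample))
# ['Lead.', 'Old.\n=== Detail ===\nmore', 'New.\n']
print(re.findall(r"\n==([^=].*[^=])==\n", sample))
# [' History ', ' Usage ']
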
Example #10
def extract_pages(f, filter_namespaces=False, filter_articles=None):
    try:
        elems = (elem for _, elem in iterparse(f, events=("end", )))
        # A ParseError only surfaces once parsing starts, i.e. on next()
        elem = next(elems)
    except ParseError:
        yield None, "", None
        return
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns_mapping
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    pageid_path = "./{%(ns)s}id" % ns_mapping

    for elem in elems:
        if elem.tag == page_tag:
            title = elem.find(title_path).text
            text = elem.find(text_path).text

            if filter_namespaces:
                ns = elem.find(ns_path).text
                if ns not in filter_namespaces:
                    text = None

            if filter_articles is not None:
                if not filter_articles(elem,
                                       namespace=namespace,
                                       title=title,
                                       text=text,
                                       page_tag=page_tag,
                                       text_path=text_path,
                                       title_path=title_path,
                                       ns_path=ns_path,
                                       pageid_path=pageid_path):
                    text = None

            pageid = elem.find(pageid_path).text
            yield title, text or "", pageid  # .text is None for empty pages, hence the `or`

            elem.clear()
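
The filter_articles hook receives the raw element plus keyword context (title, text, the XPath strings, and so on), and a falsy return value blanks the page's text. A hedged callback sketch with a hypothetical path:

def skip_redirects(elem, **kwargs):
    # Reject pages whose wikitext starts a redirect; `text` may be None.
    text = kwargs.get("text") or ""
    return "#REDIRECT" not in text[:200].upper()

with open("dump.xml", "rb") as f:
    for title, text, pageid in extract_pages(
            f, filter_namespaces=("0",), filter_articles=skip_redirects):
        ...
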
Example #11
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag
    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.
    Returns
    -------
    (str, list of (str, str), (float, float) or None, (Optionally) list of (str, str))
        Structure contains (title, [(section_heading, section_content), ...],
        coordinates, (Optionally) [(interlink_article, interlink_text), ...]).
    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0', )
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        coordinates = extract_coordinates(text)
        if len(coordinates) > 2:
            latitude = (
                coordinates[0] + coordinates[1] / 60.0 +
                coordinates[2] / 3600.00) * SIGNS[coordinates[3].upper()]
            longitude = (
                coordinates[4] + coordinates[5] / 60.0 +
                coordinates[6] / 3600.00) * SIGNS[coordinates[7].upper()]
            coordinates = (latitude, longitude)
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(
            top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []
        coordinates = None

    section_contents = [
        filter_wiki(section_content) for section_content in section_contents
    ]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, coordinates, interlinks
    else:
        return title, sections, coordinates
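
SIGNS is not defined in the snippet; it is presumably a mapping from hemisphere letters to signs for the degrees-minutes-seconds conversion, something like the sketch below. As a worked example, 52° 31' 12" N becomes 52 + 31/60 + 12/3600 = 52.52:

# Assumed mapping: north/east positive, south/west negative.
SIGNS = {"N": 1.0, "S": -1.0, "E": 1.0, "W": -1.0}

latitude = (52 + 31 / 60.0 + 12 / 3600.0) * SIGNS["N"]
print(round(latitude, 4))  # 52.52
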
Example #12
def get_pages_from_wiki_dump(wiki_dump_path, max_doc_count=0):

    sources_translations = ["quellen", "sources", "quelle", "source"]

    category_pattern = re.compile(
        r"\[\[(Category|Kategorie|Catégorie):(.*?)\]\]")
    footnote_pattern = re.compile(r"==(.+?)==(.+?)\n *\n", flags=re.DOTALL)
    url_pattern = re.compile(r"https?://[^\s|\]]+")
    blank_pattern = re.compile(r"^\s*$")

    with open(wiki_dump_path, "rb") as xml_fileobj:
        page_xmls = extract_page_xmls(xml_fileobj)
        i = 0
        wrong_ns = 0
        no_sources = 0
        no_text = 0
        redirect = 0

        docs = []

        for i, page_xml in enumerate(page_xmls):

            elem = cElementTree.fromstring(page_xml)
            filter_namespaces = ("0", )
            namespace = get_namespace(elem.tag)
            ns_mapping = {"ns": namespace}
            text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
            title_path = "./{%(ns)s}title" % ns_mapping
            ns_path = "./{%(ns)s}ns" % ns_mapping

            title = elem.find(title_path).text
            text = elem.find(text_path).text
            ns = elem.find(ns_path).text
            if ns not in filter_namespaces:
                wrong_ns += 1
                continue

            try:

                categories = [c for _, c in category_pattern.findall(text)]

                sources = find_sources(text, sources_translations,
                                       footnote_pattern, url_pattern)

                cleaned_text = category_pattern.sub("", text)
                cleaned_text = footnote_pattern.sub("", cleaned_text)
                cleaned_text = filter_wiki(cleaned_text)
                passages = [
                    passage for passage in cleaned_text.split("\n\n")
                    if blank_pattern.match(passage) is None
                ]

                sources = clean_sources(sources)

                if len(" ".join(passages).split()) == 0:
                    no_text += 1
                    continue

                if "#REDIRECT" in cleaned_text or "#redirect" in cleaned_text:
                    redirect += 1
                    continue

                if not sources:
                    no_sources += 1
                    continue

                docs.append({
                    "title": title,
                    "text": passages,
                    "categories": categories,
                    "sources": sources,
                })

                if 0 < max_doc_count < len(docs):
                    break
            except (TypeError, ValueError) as e:
                logger.error(f"Cannot read page #{i} - {title}: {e}")

    print(
        "Pages read: {}\nPages returned: {}\nWrong namespace: {}\nNo sources: {}\nNo text: {}\nRedirect: {}"
        .format(i + 1, len(docs), wrong_ns, no_sources, no_text, redirect))

    return docs
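
A usage sketch for the full pipeline, with a hypothetical dump path; max_doc_count caps how many documents are collected:

docs = get_pages_from_wiki_dump("dewiki-latest-pages-articles.xml", max_doc_count=100)
for doc in docs[:3]:
    print(doc["title"], "| passages:", len(doc["text"]), "| sources:", len(doc["sources"]))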