def extract_page_xmls(f):
    """Extract pages from a MediaWiki database dump.

    Parameters
    ----------
    f : file
        File descriptor of MediaWiki dump.

    Yields
    ------
    str
        XML strings for page tags.

    """
    elems = (elem for _, elem in cElementTree.iterparse(f, events=("end",)))

    elem = next(elems)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns_mapping

    for elem in elems:
        if elem.tag == page_tag:
            yield cElementTree.tostring(elem)
            # Prune the element tree, as per
            # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
            # except that we don't need to prune backlinks from the parent
            # because we don't use LXML.
            # We do this only for <page>s, since we need to inspect the
            # ./revision/text element. The pages comprise the bulk of the
            # file, so in practice we prune away enough.
            elem.clear()

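# A minimal usage sketch for extract_page_xmls. The dump path below is
# hypothetical; Wikipedia dumps ship bz2-compressed, so we wrap the file
# in bz2.BZ2File instead of decompressing it first.
import bz2

def count_pages(dump_path):
    """Count <page> elements in a dump without loading it into memory."""
    with bz2.BZ2File(dump_path) as f:
        return sum(1 for _ in extract_page_xmls(f))

# count_pages("enwiki-latest-pages-articles.xml.bz2")
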
def extract_pages_without_namespaces(f, filter_namespaces=False):
    """Extract pages from a MediaWiki database dump, optionally filtering by namespace.

    Parameters
    ----------
    f : file
        File descriptor of MediaWiki dump.
    filter_namespaces : list of str, optional
        Namespace ids to keep; pages outside them yield empty text.

    Yields
    ------
    (str, str, str)
        Page title, page text (or "" if filtered or empty), and page id.

    """
    elems = (elem for _, elem in iterparse(f, events=("end",)))

    elem = next(elems)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns_mapping
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    pageid_path = "./{%(ns)s}id" % ns_mapping

    for elem in elems:
        if elem.tag == page_tag:
            title = elem.find(title_path).text
            text = elem.find(text_path).text

            if filter_namespaces:
                res = elem.find(ns_path)
                # Element truthiness depends on having child elements, so
                # test against None explicitly; default to the main namespace.
                ns = res.text if res is not None else '0'
                if ns not in filter_namespaces:
                    text = None

            pageid = elem.find(pageid_path).text
            yield title, text or "", pageid  # empty or filtered page yields ""

            elem.clear()

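# Sketch: keep only main-namespace ("0") articles and print the first few
# titles. The dump path is hypothetical; filter_namespaces takes a
# collection of namespace id strings.
import bz2
from itertools import islice

def preview_titles(dump_path, n=5):
    with bz2.BZ2File(dump_path) as f:
        pages = extract_pages_without_namespaces(f, filter_namespaces=("0",))
        for title, text, pageid in islice(pages, n):
            print(pageid, title, len(text))
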
def process_stream(args):
    """Collect entity-word co-occurrence and word counts from one dump file.

    `args` is a (path, vocab) tuple so the function can be mapped over
    several dump files at once.
    """
    path, vocab = args
    page_entity_word_co_occur = dict()
    context_entity_word_co_occur = dict()
    word_count = dict()

    xml_iterator = iterparse(bz2.BZ2File(path), events=("end",))
    page_counter = 0
    for elem_idx, (_, elem) in enumerate(xml_iterator):
        if elem_idx == 0:
            # The first element tells us the XML namespace used by the dump.
            namespace = get_namespace(elem.tag)
            ns_mapping = {"ns": namespace}
            page_tag = "{%(ns)s}page" % ns_mapping
            text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
            title_path = "./{%(ns)s}title" % ns_mapping

        if elem.tag == page_tag:
            title = elem.find(title_path).text.replace(' ', '_')
            text = elem.find(text_path).text
            if text is None:
                text = ""
            if parse_page(title, text, vocab,
                          page_entity_word_co_occur,
                          context_entity_word_co_occur,
                          word_count,
                          entity_window_size=20):
                page_counter += 1
            elem.clear()

    return (page_entity_word_co_occur, context_entity_word_co_occur,
            word_count, page_counter)

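# Because process_stream takes a single (path, vocab) tuple and returns
# plain dicts, it is easy to fan out over dump shards with multiprocessing
# and merge the partial counts afterwards. This is a sketch; shard_paths
# and vocab are hypothetical, and only the word counts are merged here.
from collections import Counter
from multiprocessing import Pool

def merge_word_counts(results):
    """Merge the word_count dicts returned by several process_stream calls."""
    total = Counter()
    for _, _, word_count, _ in results:
        total.update(word_count)
    return total

# with Pool(4) as pool:
#     results = pool.map(process_stream, [(p, vocab) for p in shard_paths])
#     word_count = merge_word_counts(results)
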
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag.

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str), str)
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) {interlinks}, article_id).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    id_path = "./{%(ns)s}id" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    articleID = elem.find(id_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks, articleID
    else:
        return title, sections, articleID

def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str))
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) {interlinks}).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks
    else:
        return title, sections

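# Sketch: combine extract_page_xmls with the segment variant above to
# stream (title, sections) pairs out of a dump. The path is hypothetical;
# pages outside namespace 0 come back with an empty section list, so we
# skip them.
import bz2

def iter_articles(dump_path):
    with bz2.BZ2File(dump_path) as f:
        for page_xml in extract_page_xmls(f):
            title, sections = segment(page_xml)
            if sections:
                yield title, sections
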
def process_dump(page_xml: Any) -> Union[Dict[str, Any], None]:
    """Processes the page xml and returns a Dict with title, sections and links.

    Parameters
    ----------
    page_xml : Any
        XML extracted from Wikipedia dump

    Returns
    -------
    Union[Dict[str, Any], None]
        The processed page, or None for empty pages, redirects and pages
        outside the main namespace.
    """
    # Parse XML element
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ("0",)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text

    # Filter invalid namespaces (user pages, etc)
    if ns not in filter_namespaces:
        return None

    # Ignore empty pages and redirects; check before touching the text
    if text is None or "#REDIRECT" in text or "#redirect" in text or "#Redirect" in text:
        return None

    # Get the article names of the links in the article, only now that we
    # know the page is kept (segment re-parses the whole page)
    links_in_document = list(segment(page_xml, include_interlinks=True)[2])

    # TODO (optional) should transform this into a data class
    return {
        "title": title,
        "sections": process_text(text),
        "links": links_in_document,
    }

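# Sketch: run process_dump over a stream of page XML strings, dropping the
# None results it returns for redirects and non-article namespaces.
# Assumes process_text and a segment variant with include_interlinks are
# defined alongside it, as above.
def iter_processed(page_xmls):
    for page_xml in page_xmls:
        doc = process_dump(page_xml)
        if doc is not None:
            yield doc
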
def segment(page_xml):
    """Parse the content inside a page tag

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    Returns
    -------
    (str, list of (str, str))
        Structure contains (title, [(section_heading, section_content)]).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        assert len(section_contents) == len(section_headings)
    else:
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    return title, sections

def extract_pages(f, filter_namespaces=False, filter_articles=None):
    """Extract pages from a MediaWiki database dump.

    Parameters
    ----------
    f : file
        File descriptor of MediaWiki dump.
    filter_namespaces : list of str, optional
        Namespace ids to keep; pages outside them yield empty text.
    filter_articles : callable, optional
        Callback receiving the page element plus parsing context as keyword
        arguments; articles for which it returns False yield empty text.

    Yields
    ------
    (str, str, str)
        Page title, page text (or "" if filtered or empty), and page id.
        Yields (None, "", None) if the dump cannot be parsed.

    """
    elems = (elem for _, elem in iterparse(f, events=("end",)))

    try:
        # iterparse is lazy, so a malformed dump raises ParseError while
        # iterating, not when the generator expression above is created.
        elem = next(elems)
        namespace = get_namespace(elem.tag)
        ns_mapping = {"ns": namespace}
        page_tag = "{%(ns)s}page" % ns_mapping
        text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
        title_path = "./{%(ns)s}title" % ns_mapping
        ns_path = "./{%(ns)s}ns" % ns_mapping
        pageid_path = "./{%(ns)s}id" % ns_mapping

        for elem in elems:
            if elem.tag == page_tag:
                title = elem.find(title_path).text
                text = elem.find(text_path).text

                if filter_namespaces:
                    ns = elem.find(ns_path).text
                    if ns not in filter_namespaces:
                        text = None

                if filter_articles is not None:
                    if not filter_articles(
                            elem, namespace=namespace, title=title,
                            text=text, page_tag=page_tag,
                            text_path=text_path, title_path=title_path,
                            ns_path=ns_path, pageid_path=pageid_path):
                        text = None

                pageid = elem.find(pageid_path).text
                yield title, text or "", pageid  # empty or filtered page yields ""

                elem.clear()
    except ParseError:
        yield None, "", None

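# Sketch of a filter_articles callback for extract_pages. The callback
# receives the raw element and the precomputed paths as keyword arguments
# and returns a bool; this hypothetical filter drops redirect pages.
def skip_redirects(elem, **kwargs):
    text = kwargs.get("text")
    return text is not None and "#redirect" not in text.lower()

# for title, text, pageid in extract_pages(f, filter_namespaces=("0",),
#                                          filter_articles=skip_redirects):
#     ...
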
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag.

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (float, float) or None, (Optionally) list of (str, str))
        Structure contains (title, [(section_heading, section_content), ...],
        (latitude, longitude) or None,
        (Optionally) [(interlink_article, interlink_text), ...]).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        # extract_coordinates is expected to return degrees/minutes/seconds
        # plus a hemisphere letter for both latitude and longitude (8 items).
        coordinates = extract_coordinates(text)
        if len(coordinates) > 2:
            latitude = (coordinates[0] + coordinates[1] / 60.0
                        + coordinates[2] / 3600.0) * SIGNS[coordinates[3].upper()]
            longitude = (coordinates[4] + coordinates[5] / 60.0
                         + coordinates[6] / 3600.0) * SIGNS[coordinates[7].upper()]
            coordinates = (latitude, longitude)

        if include_interlinks:
            interlinks = find_interlinks(text)

        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []
        coordinates = None

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, coordinates, interlinks
    else:
        return title, sections, coordinates

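# The coordinate conversion above is plain degrees/minutes/seconds to
# decimal degrees, with SIGNS assumed to map hemisphere letters to +1/-1
# (e.g. {"N": 1, "S": -1, "E": 1, "W": -1}). A standalone worked check:
# 48 deg 51 min 24 sec N -> 48 + 51/60 + 24/3600 = ~48.8567.
def dms_to_decimal(degrees, minutes, seconds, hemisphere):
    sign = 1 if hemisphere.upper() in ("N", "E") else -1
    return sign * (degrees + minutes / 60.0 + seconds / 3600.0)

# dms_to_decimal(48, 51, 24, "N")  # ~48.8567
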
def get_pages_from_wiki_dump(wiki_dump_path, max_doc_count=0):
    sources_translations = ["quellen", "sources", "quelle", "source"]
    category_pattern = re.compile(r"\[\[(Category|Kategorie|Catégorie):(.*?)\]\]")
    footnote_pattern = re.compile(r"==(.+?)==(.+?)\n *\n", flags=re.DOTALL)
    url_pattern = re.compile(r"https?://[^\s|\]]+")
    blank_pattern = re.compile(r"^\s*$")

    with open(wiki_dump_path, "rb") as xml_fileobj:
        page_xmls = extract_page_xmls(xml_fileobj)
        i = 0
        wrong_ns = 0
        no_sources = 0
        no_text = 0
        redirect = 0
        docs = []

        for i, page_xml in enumerate(page_xmls):
            elem = cElementTree.fromstring(page_xml)
            filter_namespaces = ("0",)
            namespace = get_namespace(elem.tag)
            ns_mapping = {"ns": namespace}
            text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
            title_path = "./{%(ns)s}title" % ns_mapping
            ns_path = "./{%(ns)s}ns" % ns_mapping

            title = elem.find(title_path).text
            text = elem.find(text_path).text
            ns = elem.find(ns_path).text

            if ns not in filter_namespaces:
                wrong_ns += 1
                continue

            try:
                categories = [c for _, c in category_pattern.findall(text)]
                sources = find_sources(text, sources_translations, footnote_pattern, url_pattern)
                cleaned_text = category_pattern.sub("", text)
                cleaned_text = footnote_pattern.sub("", cleaned_text)
                cleaned_text = filter_wiki(cleaned_text)
                passages = [passage for passage in cleaned_text.split("\n\n")
                            if blank_pattern.match(passage) is None]
                sources = clean_sources(sources)

                if len(" ".join(passages).split()) == 0:
                    no_text += 1
                    continue
                if "#REDIRECT" in cleaned_text or "#redirect" in cleaned_text:
                    redirect += 1
                    continue
                if sources == []:
                    no_sources += 1
                    continue

                docs.append({
                    "title": title,
                    "text": passages,
                    "categories": categories,
                    "sources": sources,
                })
                if 0 < max_doc_count < len(docs):
                    break
            except (TypeError, ValueError) as e:
                logger.error(f"Cannot read page #{i} - {title}: {e}")

    print(
        "Pages read: {}\nPages returned: {}\nWrong namespace: {}\nNo sources: {}\nNo text: {}\nRedirect: {}"
        .format(i + 1, len(docs), wrong_ns, no_sources, no_text, redirect)
    )
    return docs
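# Sketch: dump the parsed pages to JSON Lines for downstream use. The
# output path is hypothetical; find_sources and clean_sources are assumed
# to be defined alongside get_pages_from_wiki_dump, which expects an
# uncompressed XML dump.
import json

def dump_to_jsonl(wiki_dump_path, out_path, max_doc_count=1000):
    docs = get_pages_from_wiki_dump(wiki_dump_path, max_doc_count=max_doc_count)
    with open(out_path, "w", encoding="utf-8") as out:
        for doc in docs:
            out.write(json.dumps(doc, ensure_ascii=False) + "\n")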