def _force_unicode(s):
    """
    Return ``s`` as a unicode string, converting it when necessary.

    Unicode input is returned unchanged, bytes input goes through
    ``bytes_to_unicode``, and any other object is passed to the
    ``unicode_`` constructor.
    """
    # Already unicode: nothing to convert.
    if isinstance(s, unicode_):
        return s
    # Raw bytes: use the dedicated bytes-to-unicode helper.
    if isinstance(s, bytes_):
        return bytes_to_unicode(s)
    # Anything else: fall back to the unicode constructor.
    return unicode_(s)
def coerce_content_type(content, file_mode):
    """
    Reconcile the type of ``content`` with the mode a file was opened in.

    Bytes headed for a text-mode file (``'t'`` in ``file_mode``) are converted
    to unicode; unicode headed for a binary-mode file (``'b'`` in
    ``file_mode``) is converted to bytes. In every other case ``content`` is
    returned untouched.
    """
    wants_text = 't' in file_mode
    if wants_text and isinstance(content, compat.bytes_):
        return compat.bytes_to_unicode(content)

    wants_bytes = 'b' in file_mode
    if wants_bytes and isinstance(content, compat.unicode_):
        return compat.unicode_to_bytes(content)

    # Either the types already agree or the mode names neither 't' nor 'b'.
    return content
def coerce_content_type(content, file_mode):
    """
    Make ``content`` writable to a file opened with ``file_mode``.

    When the mode contains ``'t'`` and ``content`` is bytes, a unicode version
    is returned; when the mode contains ``'b'`` and ``content`` is unicode, a
    bytes version is returned. Otherwise ``content`` is handed back as-is.
    """
    if 't' in file_mode:
        if isinstance(content, compat.bytes_type):
            return compat.bytes_to_unicode(content)
    # Deliberately not an ``else``: a text-mode check that did not convert
    # still falls through to the binary-mode check.
    if 'b' in file_mode:
        if isinstance(content, compat.unicode_type):
            return compat.unicode_to_bytes(content)
    return content
def __iter__(self):
    """
    Stream the pages of a Wikipedia articles database dump
    (*articles.xml.bz2) as (page id, page title, page content) 3-tuples.

    Pages whose ``ns`` element is not ``'0'``, and pages whose text element
    is empty, are yielded with an empty string as content.

    Yields:
        Tuple[str, str, str]: page id, title, content with wikimedia markup

    Raises:
        IOError: if ``self.filename`` is falsy.
        ValueError: if the first parsed element is not in a MediaWiki
            export namespace.
    """
    if not self.filename:
        raise IOError('{} file not found'.format(self._filename))

    if compat.is_python2 is False:
        parse_events, open_mode = ('end', ), 'rt'
    else:
        # Python 2 can't open bzip in text mode :(
        parse_events, open_mode = (b'end', ), 'rb'
    dump_file = fileio.open_sesame(self.filename, mode=open_mode)

    with dump_file:
        parsed = iterparse(dump_file, events=parse_events)

        # The first completed element is only used to discover the namespace.
        _, first_node = next(parsed)
        ns_match = re.match('^{(.*?)}', first_node.tag)
        if ns_match:
            namespace = ns_match.group(1)
        else:
            namespace = ''
        if not namespace.startswith(
                'http://www.mediawiki.org/xml/export-'):
            raise ValueError(
                'namespace "{}" not a valid MediaWiki dump namespace'.
                format(namespace))

        # Namespace-qualified tag and lookup paths, built once up front.
        page_tag = '{%s}page' % namespace
        ns_path = './{%s}ns' % namespace
        page_id_path = './{%s}id' % namespace
        title_path = './{%s}title' % namespace
        text_path = './{%s}revision/{%s}text' % (namespace, namespace)

        for _, node in parsed:
            if node.tag != page_tag:
                continue

            page_id = node.find(page_id_path).text
            title = node.find(title_path).text
            ns = node.find(ns_path).text

            content = node.find(text_path).text if ns == '0' else ''
            if content is not None:
                if not isinstance(content, compat.unicode_):
                    content = compat.bytes_to_unicode(
                        content, errors='ignore')
            else:
                content = ''

            yield page_id, title, content
            # Clear the page element once the consumer has moved on.
            node.clear()
def __iter__(self):
    """
    Iterate over the pages of a Wikipedia articles database dump
    (*articles.xml.bz2), yielding one (page id, page title, page content)
    3-tuple at a time.

    Pages whose ``ns`` element is not ``'0'`` are yielded with empty content,
    as are pages whose text element is empty.

    Yields:
        Tuple[str, str, str]: page id, title, content with wikimedia markup

    Raises:
        ValueError: if the first parsed element is not in a MediaWiki
            export namespace.
    """
    if PY2 is False:
        events = ('end',)
        f = open_sesame(self.path, mode='rt')
    else:  # Python 2 can't open bzip in text mode :(
        events = (b'end',)
        f = open_sesame(self.path, mode='rb')

    with f:
        elems = (elem for _, elem in iterparse(f, events=events))

        # The first completed element is only used to discover the namespace.
        elem = next(elems)
        match = re.match('^{(.*?)}', elem.tag)
        namespace = match.group(1) if match else ''
        if not namespace.startswith('http://www.mediawiki.org/xml/export-'):
            raise ValueError(
                'namespace "{}" not a valid MediaWiki dump namespace'.format(namespace))

        # Namespace-qualified tag and lookup paths, built once up front.
        page_tag = '{%s}page' % namespace
        ns_path = './{%s}ns' % namespace
        page_id_path = './{%s}id' % namespace
        title_path = './{%s}title' % namespace
        text_path = './{%s}revision/{%s}text' % (namespace, namespace)

        for elem in elems:
            if elem.tag == page_tag:
                page_id = elem.find(page_id_path).text
                title = elem.find(title_path).text
                ns = elem.find(ns_path).text
                if ns != '0':
                    content = ''
                else:
                    content = elem.find(text_path).text
                if content is None:
                    # An empty text element has ``.text is None``; without
                    # this guard None would reach ``bytes_to_unicode``.
                    content = ''
                elif not isinstance(content, unicode_type):
                    content = bytes_to_unicode(content, errors='ignore')
                yield page_id, title, content
                # Clear the page element once the consumer has moved on.
                elem.clear()