def __init__(self, output_dir, nfiles=20):
    """Set up the output writers rooted at `output_dir`.

    One `MsgpackStreamWriter` is created per shard, named
    `wiktionary_00.msgpack`, `wiktionary_01.msgpack`, ... up to `nfiles`
    shards, followed by a `TitleDBWriter` for `titles.db` (constructed with
    `clear=True`).
    """
    self.nfiles = nfiles

    # Build the shard writers one by one, in shard-number order.
    shard_writers = []
    for shard_index in range(nfiles):
        shard_path = output_dir + '/wiktionary_%02d.msgpack' % shard_index
        shard_writers.append(MsgpackStreamWriter(shard_path))
    self.writers = shard_writers

    titles_path = output_dir + '/titles.db'
    self.title_db = TitleDBWriter(titles_path, clear=True)
class WiktionaryWriter(object):
    """
    Parses a wiktionary file in XML format and saves the results to a set of
    files in msgpack format and a SQLite database.

    Subclasses most likely want to override the methods `_get_language()` and
    `handle_section()`.
    """

    def __init__(self, output_dir, nfiles=20):
        """Create `nfiles` msgpack shard writers and the title database
        writer, all located under `output_dir`."""
        self.nfiles = nfiles
        self.writers = [
            MsgpackStreamWriter(output_dir + '/wiktionary_%02d.msgpack' % i)
            for i in range(nfiles)
        ]
        self.title_db = TitleDBWriter(output_dir + '/titles.db', clear=True)

    def _get_language_code(self, language):
        """Look up `language` in the 'en' table of NAME_TO_CODE.

        Returns whatever `.get()` yields; the caller treats None as
        "no code known for this language".
        """
        return NAME_TO_CODE['en'].get(language)

    def _get_language(self, heading):
        """Essentially a no-op method by default, but meant to be overridden
        with something more useful in subclasses."""
        return heading

    def parse_wiktionary_file(self, filename):
        """Parse the XML file at `filename` with a SAX parser.

        An `ExtractPages` handler is constructed with `self.handle_page` as
        its argument, and the whole parse runs inside a single `title_db`
        transaction.
        """
        # Create a parser
        parser = make_parser()
        # Tell the parser we are not interested in XML namespaces
        parser.setFeature(feature_namespaces, 0)
        # Create the handler
        dh = ExtractPages(self.handle_page)
        # Tell the parser to use our handler
        parser.setContentHandler(dh)
        # Parse the input. The file is opened in a `with` block so the handle
        # is closed even when parsing raises, instead of being leaked.
        with self.title_db.transaction():
            with open(filename) as xml_file:
                parser.parse(xml_file)

    def handle_page(self, title, text, site):
        """Split a page on SECTION_HEADER_RES[2] and process each section.

        Pages whose title contains ':' are skipped entirely.
        """
        if ':' in title:
            return
        # split() with a capturing pattern alternates text and headings:
        # odd indices are headings, the following even indices their bodies.
        found = SECTION_HEADER_RES[2].split(text)
        headings = found[1::2]
        texts = found[2::2]
        # Use a distinct loop name so the `text` parameter is not shadowed.
        for heading, section_text in zip(headings, texts):
            heading = fix_heading(heading)
            self.handle_language_section(site, title, heading, section_text)

    def handle_language_section(self, site, title, heading, text):
        """Parse one top-level section, write it to a msgpack shard, and
        record its title in the title database when a language code is known.
        """
        sec_data = self.handle_section(text, heading, level=2)
        language = self._get_language(sec_data['heading'])
        data = {
            'site': site,
            'language': language,
            'title': title,
            'sections': sec_data['sections']
        }
        # NOTE(review): on Python 3 the built-in hash() of str is salted per
        # process unless PYTHONHASHSEED is fixed, so the shard an entry lands
        # in may differ between runs -- confirm nothing relies on stable
        # shard assignment.
        filenum = hash((site, title, heading)) % self.nfiles
        self.writers[filenum].write(data)

        # Save the languages and titles to a database file
        language_code = self._get_language_code(language)
        if language_code is not None:
            self.title_db.add(language_code, title.lower())

    def handle_section(self, text, heading, level):
        """Recursively split `text` into subsections one level deeper.

        Returns a dict with the section's 'heading', its stripped leading
        'text' (everything before the first subsection header), and a list
        of 'sections' built the same way from each subsection.
        """
        section_finder = SECTION_HEADER_RES[level + 1]
        found = section_finder.split(text)
        headings = found[1::2]
        texts = found[2::2]
        data = {
            'heading': heading,
            'text': found[0].strip(),
            'sections': [self.handle_section(text2, heading2, level + 1)
                         for (text2, heading2) in zip(texts, headings)]
        }
        return data

    def close(self):
        """Close the title database writer."""
        self.title_db.close()
class WiktionaryWriter(object):
    """
    Parses a wiktionary file in XML format and saves the results to a set of
    files in msgpack format and a SQLite database.

    Subclasses most likely want to override the methods `_get_language()` and
    `handle_section()`.
    """

    def __init__(self, output_dir, nfiles=20):
        """Create `nfiles` msgpack shard writers and the title database
        writer, all located under `output_dir`."""
        self.nfiles = nfiles
        self.writers = [MsgpackStreamWriter(output_dir + "/wiktionary_%02d.msgpack" % i) for i in range(nfiles)]
        self.title_db = TitleDBWriter(output_dir + "/titles.db", clear=True)

    def _get_language_code(self, language):
        """Look up `language` in the "en" table of NAME_TO_CODE.

        Returns whatever `.get()` yields; the caller treats None as
        "no code known for this language".
        """
        return NAME_TO_CODE["en"].get(language)

    def _get_language(self, heading):
        """Essentially a no-op method by default, but meant to be overridden
        with something more useful in subclasses."""
        return heading

    def parse_wiktionary_file(self, filename):
        """Parse the XML file at `filename` with a SAX parser.

        An `ExtractPages` handler is constructed with `self.handle_page` as
        its argument, and the whole parse runs inside a single `title_db`
        transaction.
        """
        # Create a parser
        parser = make_parser()
        # Tell the parser we are not interested in XML namespaces
        parser.setFeature(feature_namespaces, 0)
        # Create the handler
        dh = ExtractPages(self.handle_page)
        # Tell the parser to use our handler
        parser.setContentHandler(dh)
        # Parse the input. The file is opened in a `with` block so the handle
        # is closed even when parsing raises, instead of being leaked.
        with self.title_db.transaction():
            with open(filename) as xml_file:
                parser.parse(xml_file)

    def handle_page(self, title, text, site):
        """Split a page on SECTION_HEADER_RES[2] and process each section.

        Pages whose title contains ":" are skipped entirely.
        """
        if ":" in title:
            return
        # split() with a capturing pattern alternates text and headings:
        # odd indices are headings, the following even indices their bodies.
        found = SECTION_HEADER_RES[2].split(text)
        headings = found[1::2]
        texts = found[2::2]
        # Use a distinct loop name so the `text` parameter is not shadowed.
        for heading, section_text in zip(headings, texts):
            heading = fix_heading(heading)
            self.handle_language_section(site, title, heading, section_text)

    def handle_language_section(self, site, title, heading, text):
        """Parse one top-level section, write it to a msgpack shard, and
        record its title in the title database when a language code is known.
        """
        sec_data = self.handle_section(text, heading, level=2)
        language = self._get_language(sec_data["heading"])
        data = {"site": site, "language": language, "title": title, "sections": sec_data["sections"]}
        # NOTE(review): on Python 3 the built-in hash() of str is salted per
        # process unless PYTHONHASHSEED is fixed, so the shard an entry lands
        # in may differ between runs -- confirm nothing relies on stable
        # shard assignment.
        filenum = hash((site, title, heading)) % self.nfiles
        self.writers[filenum].write(data)

        # Save the languages and titles to a database file
        language_code = self._get_language_code(language)
        if language_code is not None:
            self.title_db.add(language_code, title.lower())

    def handle_section(self, text, heading, level):
        """Recursively split `text` into subsections one level deeper.

        Returns a dict with the section's "heading", its stripped leading
        "text" (everything before the first subsection header), and a list
        of "sections" built the same way from each subsection.
        """
        section_finder = SECTION_HEADER_RES[level + 1]
        found = section_finder.split(text)
        headings = found[1::2]
        texts = found[2::2]
        data = {
            "heading": heading,
            "text": found[0].strip(),
            "sections": [self.handle_section(text2, heading2, level + 1) for (text2, heading2) in zip(texts, headings)],
        }
        return data

    def close(self):
        """Close the title database writer."""
        self.title_db.close()
class WiktionaryWriter(object):
    """
    Parses a wiktionary file in XML format and saves the results to a set of
    files in msgpack format and a SQLite database.

    Subclasses most likely want to override the methods
    `_get_language_code()` and `handle_section()`.
    """

    def __init__(self, output_dir, nfiles=20):
        """Create `nfiles` msgpack shard writers and the title database
        writer, all located under `output_dir`."""
        self.nfiles = nfiles
        self.writers = [
            MsgpackStreamWriter(output_dir + '/wiktionary_%02d.msgpack' % i)
            for i in range(nfiles)
        ]
        self.title_db = TitleDBWriter(output_dir + '/titles.db', clear=True)

    def _get_language_code(self, language):
        """Map a section heading to a language code via
        `_language_name_to_code(language, 'en')`.

        A None result makes `handle_language_section` skip the section.
        """
        return _language_name_to_code(language, 'en')

    def parse_wiktionary_file(self, filename):
        """Parse the XML file at `filename` with a SAX parser.

        An `ExtractPages` handler is constructed with `self.handle_page` as
        its argument, and the whole parse runs inside a single `title_db`
        transaction.
        """
        # Create a parser
        parser = make_parser()
        # Tell the parser we are not interested in XML namespaces
        parser.setFeature(feature_namespaces, 0)
        # Create the handler
        dh = ExtractPages(self.handle_page)
        # Tell the parser to use our handler
        parser.setContentHandler(dh)
        # Parse the input. The file is opened in a `with` block so the handle
        # is closed even when parsing raises, instead of being leaked.
        with self.title_db.transaction():
            with open(filename) as xml_file:
                parser.parse(xml_file)

    def handle_page(self, title, text, site):
        """Split a page on SECTION_HEADER_RES[2] and process each section.

        Pages whose title contains ':' are skipped entirely.
        """
        if ':' in title:
            return
        # split() with a capturing pattern alternates text and headings:
        # odd indices are headings, the following even indices their bodies.
        found = SECTION_HEADER_RES[2].split(text)
        headings = found[1::2]
        texts = found[2::2]
        # Use a distinct loop name so the `text` parameter is not shadowed.
        for heading, section_text in zip(headings, texts):
            heading = fix_heading(heading)
            self.handle_language_section(site, title, heading, section_text)

    def handle_language_section(self, site, title, heading, text):
        """Parse one top-level section and, when its heading maps to a
        language code, write it to a msgpack shard and record the title.

        Sections whose language code is None are dropped without writing
        anything.
        """
        sec_data = self.handle_section(text, heading, level=2)
        language = self._get_language_code(sec_data['heading'])
        if language is None:
            return
        data = {
            'site': site,
            'language': language,
            'title': title,
            'sections': sec_data['sections']
        }
        # NOTE(review): on Python 3 the built-in hash() of str is salted per
        # process unless PYTHONHASHSEED is fixed, so the shard an entry lands
        # in may differ between runs -- confirm nothing relies on stable
        # shard assignment.
        filenum = hash((site, title, heading)) % self.nfiles
        self.writers[filenum].write(data)

        # Save the languages and titles to a database file
        self.title_db.add(language, title.lower())

    def handle_section(self, text, heading, level):
        """Recursively split `text` into subsections one level deeper.

        Returns a dict with the section's 'heading', its stripped leading
        'text' (everything before the first subsection header), and a list
        of 'sections' built the same way from each subsection.
        """
        section_finder = SECTION_HEADER_RES[level + 1]
        found = section_finder.split(text)
        headings = found[1::2]
        texts = found[2::2]
        data = {
            'heading': heading,
            'text': found[0].strip(),
            'sections': [self.handle_section(text2, heading2, level + 1)
                         for (text2, heading2) in zip(texts, headings)]
        }
        return data

    def close(self):
        """Close the title database writer."""
        self.title_db.close()