Example #1
0
 def __init__(self, output_dir, nfiles=20):
     """Create the msgpack writers and the title database writer.

     `output_dir` is used as a plain string prefix for every output path.
     One MsgpackStreamWriter is built per index in `range(nfiles)`, and a
     TitleDBWriter is built for `titles.db` with `clear=True`.
     """
     self.nfiles = nfiles

     shard_writers = []
     for shard_index in range(nfiles):
         shard_name = '/wiktionary_%02d.msgpack' % shard_index
         shard_writers.append(MsgpackStreamWriter(output_dir + shard_name))
     self.writers = shard_writers

     title_db_path = output_dir + '/titles.db'
     self.title_db = TitleDBWriter(title_db_path, clear=True)
Example #2
0
 def __init__(self, output_dir, nfiles=20):
     """Set up `nfiles` msgpack stream writers plus the title database.

     Output paths are formed by string concatenation onto `output_dir`:
     `/wiktionary_NN.msgpack` for each writer and `/titles.db` for the
     TitleDBWriter (constructed with `clear=True`).
     """
     self.nfiles = nfiles
     # Paths are produced lazily, so each writer is constructed right after
     # its own path is formatted.
     shard_paths = (
         output_dir + '/wiktionary_%02d.msgpack' % number
         for number in range(nfiles)
     )
     self.writers = [MsgpackStreamWriter(path) for path in shard_paths]
     self.title_db = TitleDBWriter(output_dir + '/titles.db', clear=True)
Example #3
0
class WiktionaryWriter(object):
    """
    Parses a wiktionary file in XML format and saves the results to a set of
    files in msgpack format and a SQLite database.

    Subclasses most likely want to override the methods `_get_language()` and
    `handle_section()`.
    """
    def __init__(self, output_dir, nfiles=20):
        """Create `nfiles` msgpack writers and the title database writer.

        `output_dir` is used as a plain string prefix: the outputs are
        `<output_dir>/wiktionary_NN.msgpack` and `<output_dir>/titles.db`.
        """
        self.nfiles = nfiles
        self.writers = [
            MsgpackStreamWriter(output_dir + '/wiktionary_%02d.msgpack' % i)
            for i in range(nfiles)
        ]
        self.title_db = TitleDBWriter(output_dir + '/titles.db', clear=True)

    def _get_language_code(self, language):
        """Look up `language` in the 'en' entry of NAME_TO_CODE.

        Returns whatever `.get()` yields; the caller treats None as
        "no code known for this language".
        """
        return NAME_TO_CODE['en'].get(language)

    def _get_language(self, heading):
        """Essentially a no-op method by default, but meant to be overridden
        with something more useful in subclasses."""
        return heading

    def parse_wiktionary_file(self, filename):
        """Parse the XML file at `filename` with a SAX parser.

        An ExtractPages handler is installed with `handle_page` as its
        callback, and the whole parse runs inside a single
        `title_db.transaction()` block.
        """
        parser = make_parser()

        # Tell the parser we are not interested in XML namespaces
        parser.setFeature(feature_namespaces, 0)

        # Route parser events to our page-extracting handler
        parser.setContentHandler(ExtractPages(self.handle_page))

        # Open the input in a `with` block so the file handle is closed when
        # parsing finishes or fails, instead of being left to the garbage
        # collector.
        with self.title_db.transaction():
            with open(filename) as xml_file:
                parser.parse(xml_file)

    def handle_page(self, title, text, site):
        """Split a page into level-2 sections and process each of them.

        Pages whose title contains ':' are skipped entirely.
        """
        if ':' in title:
            return

        found = SECTION_HEADER_RES[2].split(text)
        # The slicing assumes `split` alternates body text and captured
        # headings: odd indices are headings, the following even indices are
        # their texts. Anything before the first heading (found[0]) is not
        # used here.
        headings = found[1::2]
        texts = found[2::2]
        for heading, section_text in zip(headings, texts):
            heading = fix_heading(heading)
            self.handle_language_section(site, title, heading, section_text)

    def handle_language_section(self, site, title, heading, text):
        """Write one level-2 section to a msgpack writer and record its title.

        The section is parsed with `handle_section`, written to one of
        `self.writers`, and, when a language code is found, the lowercased
        title is added to the title database under that code.
        """
        sec_data = self.handle_section(text, heading, level=2)
        language = self._get_language(sec_data['heading'])
        data = {
            'site': site,
            'language': language,
            'title': title,
            'sections': sec_data['sections']
        }
        # NOTE(review): on Python 3, str hashes are salted per process unless
        # PYTHONHASHSEED is fixed, so the writer chosen for a given entry can
        # differ between runs.
        filenum = hash((site, title, heading)) % self.nfiles
        self.writers[filenum].write(data)

        # Save the languages and titles to a database file
        language_code = self._get_language_code(language)
        if language_code is not None:
            self.title_db.add(language_code, title.lower())

    def handle_section(self, text, heading, level):
        """Recursively break `text` into nested sections.

        `text` is split with the pattern for the next heading level
        (`SECTION_HEADER_RES[level + 1]`). Returns a dict with the given
        'heading', the stripped 'text' that precedes the first subsection,
        and a 'sections' list of dicts of the same shape.
        """
        section_finder = SECTION_HEADER_RES[level + 1]
        found = section_finder.split(text)
        headings = found[1::2]
        texts = found[2::2]
        data = {
            'heading': heading,
            'text': found[0].strip(),
            'sections': [self.handle_section(text2, heading2, level + 1)
                         for (text2, heading2) in zip(texts, headings)]
        }
        return data

    def close(self):
        """Close the title database.

        NOTE(review): the msgpack writers in `self.writers` are not closed
        here; confirm whether MsgpackStreamWriter needs an explicit close.
        """
        self.title_db.close()
class WiktionaryWriter(object):
    """
    Parses a wiktionary file in XML format and saves the results to a set of
    files in msgpack format and a SQLite database.

    Subclasses most likely want to override the methods `_get_language()` and
    `handle_section()`.
    """

    def __init__(self, output_dir, nfiles=20):
        """Create `nfiles` msgpack writers and the title database writer.

        `output_dir` is used as a plain string prefix: the outputs are
        `<output_dir>/wiktionary_NN.msgpack` and `<output_dir>/titles.db`.
        """
        self.nfiles = nfiles
        self.writers = [MsgpackStreamWriter(output_dir + "/wiktionary_%02d.msgpack" % i) for i in range(nfiles)]
        self.title_db = TitleDBWriter(output_dir + "/titles.db", clear=True)

    def _get_language_code(self, language):
        """Look up `language` in the "en" entry of NAME_TO_CODE.

        Returns whatever `.get()` yields; the caller treats None as
        "no code known for this language".
        """
        return NAME_TO_CODE["en"].get(language)

    def _get_language(self, heading):
        """Essentially a no-op method by default, but meant to be overridden
        with something more useful in subclasses."""
        return heading

    def parse_wiktionary_file(self, filename):
        """Parse the XML file at `filename` with a SAX parser.

        An ExtractPages handler is installed with `handle_page` as its
        callback, and the whole parse runs inside a single
        `title_db.transaction()` block.
        """
        parser = make_parser()

        # Tell the parser we are not interested in XML namespaces
        parser.setFeature(feature_namespaces, 0)

        # Route parser events to our page-extracting handler
        parser.setContentHandler(ExtractPages(self.handle_page))

        # Open the input in a `with` block so the file handle is closed when
        # parsing finishes or fails, instead of being left to the garbage
        # collector.
        with self.title_db.transaction():
            with open(filename) as xml_file:
                parser.parse(xml_file)

    def handle_page(self, title, text, site):
        """Split a page into level-2 sections and process each of them.

        Pages whose title contains ":" are skipped entirely.
        """
        if ":" in title:
            return

        found = SECTION_HEADER_RES[2].split(text)
        # The slicing assumes `split` alternates body text and captured
        # headings: odd indices are headings, the following even indices are
        # their texts. Anything before the first heading (found[0]) is not
        # used here.
        headings = found[1::2]
        texts = found[2::2]
        for heading, section_text in zip(headings, texts):
            heading = fix_heading(heading)
            self.handle_language_section(site, title, heading, section_text)

    def handle_language_section(self, site, title, heading, text):
        """Write one level-2 section to a msgpack writer and record its title.

        The section is parsed with `handle_section`, written to one of
        `self.writers`, and, when a language code is found, the lowercased
        title is added to the title database under that code.
        """
        sec_data = self.handle_section(text, heading, level=2)
        language = self._get_language(sec_data["heading"])
        data = {"site": site, "language": language, "title": title, "sections": sec_data["sections"]}
        # NOTE(review): on Python 3, str hashes are salted per process unless
        # PYTHONHASHSEED is fixed, so the writer chosen for a given entry can
        # differ between runs.
        filenum = hash((site, title, heading)) % self.nfiles
        self.writers[filenum].write(data)

        # Save the languages and titles to a database file
        language_code = self._get_language_code(language)
        if language_code is not None:
            self.title_db.add(language_code, title.lower())

    def handle_section(self, text, heading, level):
        """Recursively break `text` into nested sections.

        `text` is split with the pattern for the next heading level
        (`SECTION_HEADER_RES[level + 1]`). Returns a dict with the given
        "heading", the stripped "text" that precedes the first subsection,
        and a "sections" list of dicts of the same shape.
        """
        section_finder = SECTION_HEADER_RES[level + 1]
        found = section_finder.split(text)
        headings = found[1::2]
        texts = found[2::2]
        data = {
            "heading": heading,
            "text": found[0].strip(),
            "sections": [self.handle_section(text2, heading2, level + 1) for (text2, heading2) in zip(texts, headings)],
        }
        return data

    def close(self):
        """Close the title database.

        NOTE(review): the msgpack writers in `self.writers` are not closed
        here; confirm whether MsgpackStreamWriter needs an explicit close.
        """
        self.title_db.close()
Example #5
0
class WiktionaryWriter(object):
    """
    Parses a wiktionary file in XML format and saves the results to a set of
    files in msgpack format and a SQLite database.

    Subclasses most likely want to override the methods `_get_language_code()`
    and `handle_section()`.
    """
    def __init__(self, output_dir, nfiles=20):
        """Create `nfiles` msgpack writers and the title database writer.

        `output_dir` is used as a plain string prefix: the outputs are
        `<output_dir>/wiktionary_NN.msgpack` and `<output_dir>/titles.db`.
        """
        self.nfiles = nfiles
        self.writers = [
            MsgpackStreamWriter(output_dir + '/wiktionary_%02d.msgpack' % i)
            for i in range(nfiles)
        ]
        self.title_db = TitleDBWriter(output_dir + '/titles.db', clear=True)

    def _get_language_code(self, language):
        """Map a section heading to a language code via
        `_language_name_to_code(language, 'en')`.

        The caller skips the section when this returns None.
        """
        return _language_name_to_code(language, 'en')

    def parse_wiktionary_file(self, filename):
        """Parse the XML file at `filename` with a SAX parser.

        An ExtractPages handler is installed with `handle_page` as its
        callback, and the whole parse runs inside a single
        `title_db.transaction()` block.
        """
        parser = make_parser()

        # Tell the parser we are not interested in XML namespaces
        parser.setFeature(feature_namespaces, 0)

        # Route parser events to our page-extracting handler
        parser.setContentHandler(ExtractPages(self.handle_page))

        # Open the input in a `with` block so the file handle is closed when
        # parsing finishes or fails, instead of being left to the garbage
        # collector.
        with self.title_db.transaction():
            with open(filename) as xml_file:
                parser.parse(xml_file)

    def handle_page(self, title, text, site):
        """Split a page into level-2 sections and process each of them.

        Pages whose title contains ':' are skipped entirely.
        """
        if ':' in title:
            return

        found = SECTION_HEADER_RES[2].split(text)
        # The slicing assumes `split` alternates body text and captured
        # headings: odd indices are headings, the following even indices are
        # their texts. Anything before the first heading (found[0]) is not
        # used here.
        headings = found[1::2]
        texts = found[2::2]
        for heading, section_text in zip(headings, texts):
            heading = fix_heading(heading)
            self.handle_language_section(site, title, heading, section_text)

    def handle_language_section(self, site, title, heading, text):
        """Write one level-2 section to a msgpack writer and record its title.

        The section is parsed with `handle_section`; if no language code is
        found for its heading, nothing is written. Otherwise the section is
        written to one of `self.writers` and the lowercased title is added
        to the title database under the language code.
        """
        sec_data = self.handle_section(text, heading, level=2)
        language = self._get_language_code(sec_data['heading'])
        if language is None:
            return
        data = {
            'site': site,
            'language': language,
            'title': title,
            'sections': sec_data['sections']
        }
        # NOTE(review): on Python 3, str hashes are salted per process unless
        # PYTHONHASHSEED is fixed, so the writer chosen for a given entry can
        # differ between runs.
        filenum = hash((site, title, heading)) % self.nfiles
        self.writers[filenum].write(data)

        # Save the languages and titles to a database file
        self.title_db.add(language, title.lower())

    def handle_section(self, text, heading, level):
        """Recursively break `text` into nested sections.

        `text` is split with the pattern for the next heading level
        (`SECTION_HEADER_RES[level + 1]`). Returns a dict with the given
        'heading', the stripped 'text' that precedes the first subsection,
        and a 'sections' list of dicts of the same shape.
        """
        section_finder = SECTION_HEADER_RES[level + 1]
        found = section_finder.split(text)
        headings = found[1::2]
        texts = found[2::2]
        data = {
            'heading': heading,
            'text': found[0].strip(),
            'sections': [self.handle_section(text2, heading2, level + 1)
                         for (text2, heading2) in zip(texts, headings)]
        }
        return data

    def close(self):
        """Close the title database.

        NOTE(review): the msgpack writers in `self.writers` are not closed
        here; confirm whether MsgpackStreamWriter needs an explicit close.
        """
        self.title_db.close()