def parse_translation_table(self, table):
        """Yield the translations listed in the ``li`` items of *table*.

        Overrides GeneralParser's method.

        Every ``li`` whose text contains a ":" is read as
        ``language name: translation, translation; ...``. Items without a
        colon are skipped. The language code is taken from the first
        descendant with class ``trad-sup-code``: its text is stripped and the
        first and last characters (presumably the surrounding parentheses)
        are dropped.

        :param table: a Tag object. Not necessarily a table; can be a div.
        :return: generator of (translation, language_name, language_code)
                 tuples; language_code is "" when no code element is found.
                 Empty translations are not yielded.
        """
        # go through all "li" elements in a table
        for li in table.find_all('li'):
            if not isinstance(li, Tag):
                continue

            # Split on the FIRST colon only: the language name is before it,
            # everything after it is the translation list. Splitting on every
            # colon would silently drop whatever follows a second colon.
            lang_name, colon, raw_translations = li.get_text().partition(':')
            if not colon:
                # no "language: translations" structure in this item
                continue
            lang_name = lang_name.strip()

            # language code is usually in super script
            code_tag = li.find(class_="trad-sup-code")
            lang_code = code_tag.text.strip()[1:-1] if code_tag else ""

            # NOTE: there are two functions that remove parentheses; it is
            # not settled which one should be used here.
            cleaned = remove_parenthesis(raw_translations)

            # each "trans" is: translation <sup>(lang_code)</sup> (transliteration)
            # lang_code and transliteration may not exist
            for trans in re.split(COMMA_OR_SEMICOLON, cleaned):
                translation = trans.split('(')[0].strip()
                if not translation:
                    # nothing left after stripping (empty entry or a
                    # trailing separator): not a translation
                    continue
                yield (translation, lang_name, lang_code)
Ejemplo n.º 2
0
    def parse_translation_table(self, table):
        """Yield the translations listed in the ``li`` items of *table*.

        Overrides GeneralParser's method.

        Every ``li`` whose text contains a ":" is read as
        ``language name: translation, translation; ...``. Items without a
        colon are skipped. The language code is taken from the first
        descendant with class ``trad-existe``: its text is stripped and the
        first and last characters are dropped.

        :param table: a Tag object. Not necessarily a table; can be a div.
        :return: generator of (translation, language_name, language_code)
                 tuples; language_code is "" when no code element is found.
                 Empty translations are not yielded.
        """
        # go through all "li" elements in a table
        for li in table.find_all('li'):
            if not isinstance(li, Tag):
                continue

            # Split on the FIRST colon only: the language name is before it,
            # everything after it is the translation list. Splitting on every
            # colon would silently drop whatever follows a second colon.
            lang_name, colon, raw_translations = li.get_text().partition(':')
            if not colon:
                # no "language: translations" structure in this item
                continue
            lang_name = lang_name.strip()

            # language code is usually in super script
            # NOTE(review): the code is read from the element with class
            # "trad-existe" -- confirm that this is the element holding the
            # "(code)" text in this edition's markup.
            code_tag = li.find(class_="trad-existe")
            lang_code = code_tag.text.strip()[1:-1] if code_tag else ""

            # NOTE: there are two functions that remove parentheses; it is
            # not settled which one should be used here.
            cleaned = remove_parenthesis(raw_translations)

            # each "trans" is: translation <sup>(lang_code)</sup> (transliteration)
            # lang_code and transliteration may not exist
            for trans in re.split(COMMA_OR_SEMICOLON, cleaned):
                translation = trans.split('(')[0].strip()
                if not translation:
                    # nothing left after stripping (empty entry or a
                    # trailing separator): not a translation
                    continue
                yield (translation, lang_name, lang_code)
Ejemplo n.º 3
0
    def generate_translation_tuples(self, soup):
        """
        A generator of translation tuples.

        Walks the direct children of the ``div#mw-content-text`` element in
        document order, keeping track of the current headword language and
        part of speech, and yields one tuple per translation found in each
        top-level ``ul``. Yields nothing when that div is absent.

        :param soup: BeautifulSoup object
        :return: tuples of the form (edition, headword, head_lang, translation, trans_lang, trans_lang_code,
                 part_of_speech, pronunciation); pronunciation is always '' in this parser
        """

        # START non-edition-specific
        # container whose direct children are scanned below
        toc = soup.find('div', id='mw-content-text')
        if toc is None:
            # no content container on this page: nothing to extract
            return

        page_state = {'headword': None,
                      'headword_lang': None,
                      'part_of_speech': ''}
        pronounce = ''
        head = soup.find('h1', id='firstHeading', class_='firstHeading')
        if head is not None:
            page_state['headword'] = head.text

        for element in toc.children:
            if not isinstance(element, Tag):  # it could be a Tag or a NavigableString
                continue
            level = self.get_heading_level(element.name)
            # END non-edition-specific

            # Find the headword language: taken from the first <b> inside the
            # element with id "toc" and class "toccolours"
            attrs = element.attrs
            if attrs.get('id') == 'toc' and 'toccolours' in attrs.get('class', ()):
                if element.b is not None:
                    page_state['headword_lang'] = remove_parenthesis(element.b.text).strip()

            # Find Part of Speech.
            # NOTE: unverified heuristic -- on the few sample pages checked,
            # the part of speech sat in a <font> tag inside a level-2 heading.
            if level == 2:
                if element.font is not None:
                    page_state['part_of_speech'] = element.font.text

            # Find the translation table
            elif element.name == 'ul':
                for translation, lang, lang_code in self.parse_translation_table(element):
                    yield (
                        self.edition, page_state['headword'], page_state['headword_lang'], translation, lang,
                        lang_code, page_state['part_of_speech'], pronounce)
    def generate_translation_tuples(self, soup):
        """
        A generator of translation tuples.

        Walks the direct children of the ``div#mw-content-text`` element in
        document order, keeping track of the current headword language and
        part of speech, and yields one tuple per translation found in each
        top-level ``ul``. Yields nothing when that div is absent.

        :param soup: BeautifulSoup object
        :return: tuples of the form (edition, headword, head_lang, translation, trans_lang, trans_lang_code,
                 part_of_speech, pronunciation); pronunciation is always '' in this parser
        """

        # START non-edition-specific
        # container whose direct children are scanned below
        toc = soup.find('div', id='mw-content-text')
        if toc is None:
            # no content container on this page: nothing to extract
            return

        page_state = {'headword': None,
                      'headword_lang': None,
                      'part_of_speech': ''}
        pronounce = ''
        head = soup.find('h1', id='titleHeading')
        if head is not None:
            page_state['headword'] = head.text

        for element in toc.children:
            if not isinstance(element, Tag):  # it could be a Tag or a NavigableString
                continue
            level = self.get_heading_level(element.name)
            # END non-edition-specific

            # Find the headword language: a level-1 heading that contains a
            # <big> tag; the name itself is read from the first <b> tag.
            # Both tags are required, so a heading with <big> but no <b> is
            # skipped instead of failing on a None dereference.
            if level == 1:
                bold = element.b
                if element.big is not None and bold is not None:
                    page_state['headword_lang'] = remove_parenthesis(bold.text).strip()

            # Find Part of Speech.
            # NOTE: unverified heuristic -- based on a small sample of pages,
            # the stripped text of a level-2 heading is used as the part of
            # speech.
            if level == 2:
                page_state['part_of_speech'] = element.text.strip()

            # Find the translation table
            elif element.name == 'ul':
                for translation, lang, lang_code in self.parse_translation_table(element):
                    yield (
                        self.edition, page_state['headword'], page_state['headword_lang'], translation, lang,
                        lang_code, page_state['part_of_speech'], pronounce)