def parse_translation_table(self, table):
    """
    Overrides GeneralParser's method.

    Walks every ``li`` under ``table`` and yields one tuple per translation
    found after the first ``:`` of the item's text.

    :param table: a Tag object. Not necessarily a table; can be a div.
    :return: yields (translation, language_name, language_code)
    """
    for item in table.find_all('li'):
        if not isinstance(item, Tag):
            continue

        pieces = item.get_text().split(':')
        if len(pieces) < 2:
            # no "language: translations" separator in this item
            continue

        # the language name is whatever precedes the first ":"
        language = pieces[0].strip()

        # the language code usually sits in a superscript element; the
        # first and last characters of its text are dropped
        code_tag = item.find(class_="trad-sup-code")
        code = code_tag.text.strip()[1:-1] if code_tag else ""

        # only the segment between the first and second ":" is parsed
        cleaned = remove_parenthesis(pieces[1])

        # each chunk is: translation <sup>(lang_code)</sup> (transliteration)
        # lang_code and transliteration may not exist
        for chunk in re.split(COMMA_OR_SEMICOLON, cleaned):
            yield (chunk.partition('(')[0].strip(), language, code)
def parse_translation_table(self, table):
    """
    Overrides GeneralParser's method.

    Iterates over the ``li`` descendants of ``table``; for each item whose
    text contains a ``:``, yields one tuple per comma/semicolon separated
    translation.

    :param table: a Tag object. Not necessarily a table; can be a div.
    :return: yields (translation, language_name, language_code)
    """
    for entry in table.find_all('li'):
        if not isinstance(entry, Tag):
            continue

        fields = entry.get_text().split(':')
        if len(fields) < 2:
            # item has no "language: translations" shape
            continue

        # text before the first ":" is the language name
        name = fields[0].strip()

        # the element carrying the language code; its text is trimmed and
        # the first and last characters are removed
        marker = entry.find(class_="trad-existe")
        if marker:
            code = marker.text.strip()[1:-1]
        else:
            code = ""

        # only the text between the first and second ":" is considered
        body = remove_parenthesis(fields[1])
        candidates = re.split(COMMA_OR_SEMICOLON, body)

        # each candidate is: translation <sup>(lang_code)</sup> (transliteration)
        # lang_code and transliteration may not exist
        for candidate in candidates:
            word = candidate.partition('(')[0].strip()
            yield (word, name, code)
def generate_translation_tuples(self, soup):
    """
    A generator of translation tuples.

    Scans the direct children of the ``div#mw-content-text`` element,
    tracking the headword language and part of speech seen so far, and
    yields one tuple for every translation found in a ``ul`` child.

    :param soup: BeautifulSoup object
    :return: yields tuples of the form
        (edition, headword, head_lang, translation, trans_lang,
        trans_lang_code, part_of_speech, pronunciation).
        The pronunciation slot is always the empty string here.
        Nothing is yielded when the page has no ``div#mw-content-text``.
    """
    # START non-edition-specific
    # this is the main content container which is present in each edition
    toc = soup.find('div', id='mw-content-text')
    if toc is None:
        # Without the content container there is nothing to iterate over;
        # stop cleanly instead of failing on ``None.children``.
        return

    page_state = {'headword': None,
                  'headword_lang': None,
                  'part_of_speech': ''}
    pronounce = ''
    head = soup.find('h1', id='firstHeading', class_='firstHeading')
    if head:
        page_state['headword'] = head.text

    for element in toc.children:
        if not isinstance(element, Tag):
            # it could be a Tag or a NavigableString; only Tags matter
            continue
        level = self.get_heading_level(element.name)
        # END non-edition-specific

        # Find the headword language: an element with id "toc" and class
        # "toccolours" whose first <b> holds the language name.
        if (element.attrs.get('id') == 'toc'
                and 'toccolours' in element.attrs.get('class', ())):
            if element.b is not None:
                page_state['headword_lang'] = remove_parenthesis(element.b.text).strip()

        # Find Part of Speech: taken from a <font> tag inside a level-2
        # heading. NOTE(review): this heuristic was derived from a small
        # sample of pages and may not hold for every entry.
        if level == 2:
            if element.font is not None:
                page_state['part_of_speech'] = element.font.text
        # Find the translation table
        elif element.name == 'ul':
            for translation, lang, lang_code in self.parse_translation_table(element):
                yield (
                    self.edition, page_state['headword'], page_state['headword_lang'],
                    translation, lang, lang_code, page_state['part_of_speech'], pronounce)
def generate_translation_tuples(self, soup):
    """
    A generator of translation tuples.

    Scans the direct children of the ``div#mw-content-text`` element,
    tracking the headword language and part of speech seen so far, and
    yields one tuple for every translation found in a ``ul`` child.

    :param soup: BeautifulSoup object
    :return: yields tuples of the form
        (edition, headword, head_lang, translation, trans_lang,
        trans_lang_code, part_of_speech, pronunciation).
        The pronunciation slot is always the empty string here.
        Nothing is yielded when the page has no ``div#mw-content-text``.
    """
    # START non-edition-specific
    # this is the main content container which is present in each edition
    toc = soup.find('div', id='mw-content-text')
    if toc is None:
        # Without the content container there is nothing to iterate over;
        # stop cleanly instead of failing on ``None.children``.
        return

    page_state = {'headword': None,
                  'headword_lang': None,
                  'part_of_speech': ''}
    pronounce = ''
    head = soup.find('h1', id='titleHeading')
    if head is not None:
        page_state['headword'] = head.text

    for element in toc.children:
        if not isinstance(element, Tag):
            # it could be a Tag or a NavigableString; only Tags matter
            continue
        level = self.get_heading_level(element.name)
        # END non-edition-specific

        # Find the headword language: a level-1 heading containing <big>.
        if level == 1 and element.big is not None:
            # Prefer the <b> child when present; fall back to <big> itself
            # so a heading with <big> but no <b> does not raise
            # AttributeError.
            # NOTE(review): confirm which tag really carries the language.
            lang_tag = element.b if element.b is not None else element.big
            page_state['headword_lang'] = remove_parenthesis(lang_tag.text).strip()

        # Find Part of Speech: the stripped text of a level-2 heading.
        # NOTE(review): this heuristic was derived from a small sample of
        # pages and may not hold for every entry.
        if level == 2:
            page_state['part_of_speech'] = element.text.strip()
        # Find the translation table
        elif element.name == 'ul':
            for translation, lang, lang_code in self.parse_translation_table(element):
                yield (
                    self.edition, page_state['headword'], page_state['headword_lang'],
                    translation, lang, lang_code, page_state['part_of_speech'], pronounce)