def parse_example(example_soup):
    """Convert a list of usage examples into an ``Examples`` element.

    Every direct ``dd``/``li`` child of ``example_soup`` becomes one
    ``Example`` element. When the child holds a nested ``dl`` its text is
    stored as a ``Translation`` child and the ``dl`` is emptied so that the
    translation does not show up again in the example's own ``Text``.

    Side effects: nested ``dl`` nodes are cleared, and ``example_soup``
    itself is cleared before returning.

    Args:
        example_soup: Node holding the examples. It must offer
            ``find_all(..., recursive=False)`` and ``clear()``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        The ``Examples`` element with one ``Example`` child per entry.
    """
    logger.debug('Start parsing examples')
    example_part_root = etree.Element('Examples')
    # A list filter compares tag names exactly. The former unanchored regex
    # 'dd|li' was applied with search() and therefore also accepted any tag
    # whose name merely contains "dd" or "li" (e.g. "address", "link").
    example_elements = example_soup.find_all(['dd', 'li'], recursive=False)
    logger.debug('Found %d example elements', len(example_elements))
    for i, example in enumerate(example_elements):
        logger.debug('Parsing example %d', i)
        example_root = etree.SubElement(example_part_root, 'Example')
        example_translation = example.find('dl')
        if example_translation is None:
            # Example is placed as a quotation instead of a standard example
            logger.debug('Example placed as a quotation')
        else:
            translation_element = etree.SubElement(example_root, 'Translation')
            translation_element.text = util.clean_text(
                example_translation.text)
            # Remove translation to avoid having it show up in the example
            # text that is read just below.
            example_translation.clear()
        text_element = etree.SubElement(example_root, 'Text')
        text_element.text = util.clean_text(example.text)
    # Empty the container so its content no longer appears in the text of
    # any enclosing node that is read afterwards.
    example_soup.clear()
    logger.debug('Finished parsing examples')
    return example_part_root
def parse_example(example_soup):
    """Convert a list of usage examples into an ``Examples`` element.

    Every direct ``dd``/``li`` child of ``example_soup`` becomes one
    ``Example`` element. When the child holds a nested ``dl`` its text is
    stored as a ``Translation`` child and the ``dl`` is emptied so that the
    translation does not show up again in the example's own ``Text``.

    Side effects: nested ``dl`` nodes are cleared, and ``example_soup``
    itself is cleared before returning.

    Args:
        example_soup: Node holding the examples. It must offer
            ``find_all(..., recursive=False)`` and ``clear()``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        The ``Examples`` element with one ``Example`` child per entry.
    """
    logger.debug('Start parsing examples')
    example_part_root = etree.Element('Examples')
    # A list filter compares tag names exactly. The former unanchored regex
    # 'dd|li' was applied with search() and therefore also accepted any tag
    # whose name merely contains "dd" or "li" (e.g. "address", "link").
    example_elements = example_soup.find_all(['dd', 'li'], recursive=False)
    logger.debug('Found %d example elements', len(example_elements))
    for i, example in enumerate(example_elements):
        logger.debug('Parsing example %d', i)
        example_root = etree.SubElement(example_part_root, 'Example')
        example_translation = example.find('dl')
        if example_translation is None:
            # Example is placed as a quotation instead of a standard example
            logger.debug('Example placed as a quotation')
        else:
            translation_element = etree.SubElement(example_root, 'Translation')
            translation_element.text = util.clean_text(
                example_translation.text)
            # Remove translation to avoid having it show up in the example
            # text that is read just below.
            example_translation.clear()
        text_element = etree.SubElement(example_root, 'Text')
        text_element.text = util.clean_text(example.text)
    # Empty the container so its content no longer appears in the text of
    # any enclosing node that is read afterwards.
    example_soup.clear()
    logger.debug('Finished parsing examples')
    return example_part_root
def parse_pronoun_table_row(row):
    """Build the element describing one case row of a pronoun table.

    The first three ``td`` cells of ``row`` are read, in order, as the case
    name, the singular form and the plural form.

    Args:
        row: Table row node offering ``find_all('td')``.

    Returns:
        An element named after the cleaned case name with ``singular`` and
        ``plural`` children holding the cleaned forms.

    Raises:
        IndexError: If the row has fewer than three ``td`` cells.
    """
    cells = row.find_all('td')
    label = util.clean_text(cells[0].text)
    singular_form = util.clean_text(cells[1].text)
    plural_form = util.clean_text(cells[2].text)

    case_element = etree.Element(label)
    # NOTE(review): the forms go through util.clean_text a second time here,
    # exactly as before; presumably the helper is idempotent -- confirm
    # before dropping the extra pass.
    for tag, form in (('singular', singular_form), ('plural', plural_form)):
        form_element = etree.SubElement(case_element, tag)
        form_element.text = util.clean_text(form)
    return case_element
def _extract_fourth_infinitives(table_rows, row_id, infinitives_element):
    """Add the ``fourth`` infinitive forms to ``infinitives_element``.

    Two consecutive rows starting at ``row_id`` are read. Each contributes
    one child of the new ``fourth`` element: the tag comes from the row's
    header cell and the text from its first ``td`` cell.

    Args:
        table_rows: Sequence of table row nodes.
        row_id: Index of the first row belonging to the fourth infinitive.
        infinitives_element: Element receiving the ``fourth`` child.
    """
    logger.debug('Extracting the fourth infinitives')
    fourth_root = etree.SubElement(infinitives_element, 'fourth')
    for position, row in enumerate(table_rows[row_id: row_id + 2]):
        cells = row.find_all('td')
        headers = row.find_all('th')
        # The first row also contains the title cell for the whole
        # infinitive, so its own label is the second header cell.
        label_header = headers[1] if position == 0 else headers[0]
        form_tag = util.clean_text(label_header.text)
        form_element = etree.SubElement(fourth_root, form_tag)
        form_element.text = util.clean_text(cells[0].text)
def _extract_fifth_infinitives(table_rows, row_id, infinitives_element):
    """Add the ``fifth`` infinitive to ``infinitives_element``.

    The text is taken from the first ``td`` cell of ``table_rows[row_id]``.

    Args:
        table_rows: Sequence of table row nodes.
        row_id: Index of the row holding the fifth infinitive.
        infinitives_element: Element receiving the ``fifth`` child.
    """
    logger.debug('Extracting the fifth infinitives')
    fifth_element = etree.SubElement(infinitives_element, 'fifth')
    first_cell = table_rows[row_id].find_all('td')[0]
    fifth_element.text = util.clean_text(first_cell.text)
def find_noun_table_start(rows):
    """Locate the first data row of the main noun table.

    The noun table has two parts. The first part contains nominative,
    genitive, partitive and illative. After these four cases the main table
    begins and those four cases are repeated there, but not in the same
    order. Because of this we want to ignore the first lines and only start
    parsing at the main table.

    The header row of the main table is recognised by its ``th`` text being
    empty once cleaned.

    Args:
        rows: The rows of the table. The first row is skipped.

    Returns:
        The index of the row where the first entry of the main table is
        located (the row after the table headers).

    Raises:
        ValueError: If no header row is found, or if a non-empty case name
            is not a valid element tag name.
    """
    logger.debug('Starting search for the main table')
    for row_index, row in enumerate(rows[1:], start=1):
        noun_case_name = util.clean_text(row.th.text)
        logger.debug(noun_case_name)
        if not noun_case_name:
            # We found the headers of the real table. The empty name is
            # tested directly instead of matching the wording of the
            # exception etree raises for an empty tag.
            logger.debug('Found the table headers in row %d', row_index)
            return row_index + 1
        # Kept on purpose: a name that cannot be used as a tag still raises
        # ValueError here, as it did before.
        etree.Element(noun_case_name)
    raise ValueError("Couldn't find the start of the main table")
def parse_noun_table(rows):
    """Parse the main noun declension table into a ``table`` element.

    Parsing starts at the row reported by ``find_noun_table_start``. Each
    row yields a child named after its cleaned ``th`` text. The accusative
    case spans two rows: the first is stored under a ``nominative``
    sub-element and the following row is handed to
    ``parse_second_accusative_row``.

    Args:
        rows: All rows of the noun table.

    Returns:
        The ``table`` element with one child per noun case.
    """
    logger.debug('Starting noun table parsing')
    table_root = etree.Element('table')
    start_row = find_noun_table_start(rows)

    expecting_second_accusative = False
    current_element = None
    for row_number, row in enumerate(rows[start_row:], start=start_row):
        logger.debug('Parsing row {}'.format(row_number))

        if expecting_second_accusative:
            logger.debug('Entering second accusative row (genitive)')
            parse_second_accusative_row(current_element, row)
            expecting_second_accusative = False
            continue

        noun_case_name = util.clean_text(row.th.text)
        logger.debug('Creating new noun case element: {}'.format(
            noun_case_name))
        current_element = etree.Element(noun_case_name)
        table_root.append(current_element)
        if noun_case_name == 'accusative':
            logger.debug('Found the accusative case')
            expecting_second_accusative = True
            # The forms of this row are stored one level deeper; the next
            # row is attached to the accusative element itself.
            current_element = etree.SubElement(current_element, 'nominative')
        parse_noun_table_row(row, current_element, noun_case_name)

    logger.debug('Finished noun table parsing')
    return table_root
def parse_noun_table_row(row, noun_case_element, noun_case_name):
    """Extract the singular and plural form from the table row.

    The singular form is the text of the first ``td`` cell. The plural form
    is the text of the second ``td`` cell, except for the genitive case,
    where only the text of the first ``span`` in that cell is used.

    Args:
        row: Table row node offering ``find_all('td')``.
        noun_case_element: Element receiving the ``singular`` and ``plural``
            children.
        noun_case_name: Name of the case this row describes.

    Raises:
        IndexError: If the row has fewer than two ``td`` cells.
    """
    row_elements = row.find_all('td')
    singular = row_elements[0].text
    if noun_case_name == 'genitive':
        logger.debug('Entering genitive case')
        plural_span = row_elements[1].find('span')
        if plural_span is None:
            # Not all words have plural forms of the genitive case.
            # Checking for the missing span explicitly keeps an
            # AttributeError raised inside util.clean_text from being
            # mistaken for "no plural form".
            plural = '—'
        else:
            plural = util.clean_text(plural_span.text)
    else:
        plural = util.clean_text(row_elements[1].text)

    singular_element = etree.SubElement(noun_case_element, 'singular')
    singular_element.text = util.clean_text(singular)
    plural_element = etree.SubElement(noun_case_element, 'plural')
    # NOTE(review): plural has already been cleaned above; the second pass
    # is kept as in the original -- presumably clean_text is idempotent.
    plural_element.text = util.clean_text(plural)
def parse_meta_information(headline_row):
    """Build the meta element from the headline row of a table.

    The cleaned text of the row's ``th`` cell is passed to
    ``extract_meta_information`` and the four values it returns are handed
    on to ``create_meta_tree``.

    Args:
        headline_row: Table row node whose ``th`` holds the headline.

    Returns:
        Whatever ``create_meta_tree`` returns for the extracted values.
    """
    logger.debug('Starting extracting meta info from table')
    headline_text = util.clean_text(headline_row.th.text)
    # Newlines are stripped for the log line only.
    logger.debug('Headline text: {}'.format(headline_text.replace('\n', '')))
    meta_values = extract_meta_information(headline_text)
    word, kotus_type, kotus_word, gradation = meta_values
    meta_element = create_meta_tree(word, kotus_type, kotus_word, gradation)
    logger.debug('Finished extracting meta info from table')
    return meta_element
def _clean_verb_table_titles(text):
    """Turn a verb table title into a string usable as a key.

    Titles containing ``tense`` are reduced to their first word. All other
    titles have their words connected with underscores.

    Args:
        text: Raw title text.

    Returns:
        The cleaned title.
    """
    title = util.clean_text(text)
    words = title.split()
    if 'tense' in title:
        return words[0]
    # Split-and-join is used because, according to the original author, a
    # simple str.replace didn't work here.
    return '_'.join(words)
def _extract_third_infinitives(table_rows, row_id, infinitives_element):
    """Add the ``third`` infinitive forms to ``infinitives_element``.

    Six consecutive rows starting at ``row_id`` are read. Each contributes
    one child of the new ``third`` element, named after the row's header
    cell and filled with ``active`` and ``passive`` sub-elements taken from
    the row's first two ``td`` cells.

    Args:
        table_rows: Sequence of table row nodes.
        row_id: Index of the first row belonging to the third infinitive.
        infinitives_element: Element receiving the ``third`` child.
    """
    logger.debug('Extracting the third infinitives')
    third_root = etree.SubElement(infinitives_element, 'third')
    for position, row in enumerate(table_rows[row_id: row_id + 6]):
        cells = row.find_all('td')
        headers = row.find_all('th')
        if position == 0:
            # The first row also contains the title cell for the whole
            # infinitive: its own label is the second header cell, and its
            # last data cell is left out.
            label_header = headers[1]
            cells = cells[:-1]
        else:
            label_header = headers[0]
        form_tag = util.clean_text(label_header.text)
        form_element = etree.SubElement(third_root, form_tag)
        _extract_active_and_passive_forms(cells, form_element, offset=0)
def parse_translation(translation_soup):
    """Convert one translation entry into a ``Translation`` element.

    If the entry contains a ``dl`` or ``ul`` node, that node is parsed with
    ``parse_example`` and the result is added first. The entry's text is
    read only afterwards; ``parse_example`` in this file clears the node it
    was given, so the examples are not repeated in ``Text``.

    Args:
        translation_soup: Node holding the translation. It must offer
            ``find`` and ``text`` (presumably a BeautifulSoup tag --
            confirm against the caller).

    Returns:
        The ``Translation`` element with an optional ``Examples`` child
        followed by a ``Text`` child.
    """
    logger.debug('Parsing translation part')
    root = etree.Element('Translation')
    # A list filter compares tag names exactly; the former unanchored regex
    # 'dl|ul' was applied with search() and would also accept any tag whose
    # name merely contains "dl" or "ul".
    example_part = translation_soup.find(['dl', 'ul'])
    if example_part is not None:
        root.append(parse_example(example_part))
    text_element = etree.SubElement(root, 'Text')
    text_element.text = util.clean_text(translation_soup.text)
    logger.debug('Finished parsing translation part')
    return root
def parse_translation(translation_soup):
    """Convert one translation entry into a ``Translation`` element.

    If the entry contains a ``dl`` or ``ul`` node, that node is parsed with
    ``parse_example`` and the result is added first. The entry's text is
    read only afterwards; ``parse_example`` in this file clears the node it
    was given, so the examples are not repeated in ``Text``.

    Args:
        translation_soup: Node holding the translation. It must offer
            ``find`` and ``text`` (presumably a BeautifulSoup tag --
            confirm against the caller).

    Returns:
        The ``Translation`` element with an optional ``Examples`` child
        followed by a ``Text`` child.
    """
    logger.debug('Parsing translation part')
    root = etree.Element('Translation')
    # A list filter compares tag names exactly; the former unanchored regex
    # 'dl|ul' was applied with search() and would also accept any tag whose
    # name merely contains "dl" or "ul".
    example_part = translation_soup.find(['dl', 'ul'])
    if example_part is not None:
        root.append(parse_example(example_part))
    text_element = etree.SubElement(root, 'Text')
    text_element.text = util.clean_text(translation_soup.text)
    logger.debug('Finished parsing translation part')
    return root
def _extract_first_two_nominal_form_lines(
        table_rows, row_id, infinitives_element, participles_element):
    """Read the first two rows of the nominal forms section.

    Row one yields the ``first`` infinitive and the ``present`` participle,
    row two the ``long_first`` infinitive and the ``past`` participle. The
    infinitive text comes from the first ``td`` cell; the participle gets
    ``active``/``passive`` children from the cells after it.

    Args:
        table_rows: Sequence of table row nodes.
        row_id: Index of the first nominal form row.
        infinitives_element: Element receiving the infinitive children.
        participles_element: Element receiving the participle children.
    """
    logger.debug('Extracting first two lines of the nominal forms')
    tag_pairs = (('first', 'present'), ('long_first', 'past'))
    rows = table_rows[row_id: row_id + 2]
    for (infinitive_tag, participle_tag), row in zip(tag_pairs, rows):
        cells = row.find_all('td')
        infinitive = etree.SubElement(infinitives_element, infinitive_tag)
        infinitive.text = util.clean_text(cells[0].text)
        participle = etree.SubElement(participles_element, participle_tag)
        # Default offset of 1 skips the infinitive cell read above.
        _extract_active_and_passive_forms(cells, participle)
def _extract_nominal_form_lines_3_to_4(
        table_rows, row_id, infinitives_element, participles_element):
    """Read the third and fourth rows of the nominal forms section.

    A ``second`` infinitive element is created; each of the two rows adds a
    child to it (``inessive``, then ``instructive``) with ``active`` and
    ``passive`` forms from the first two ``td`` cells. The third ``td``
    cell of each row becomes the ``agent`` / ``negative`` participle.

    Args:
        table_rows: Sequence of table row nodes.
        row_id: Index of the third nominal form row.
        infinitives_element: Element receiving the ``second`` child.
        participles_element: Element receiving the participle children.
    """
    second_infinitive = etree.SubElement(infinitives_element, 'second')
    logger.debug('Extracting third and fourth lines of the nominal forms')
    infinitive_tags = ('inessive', 'instructive')
    participle_tags = ('agent', 'negative')
    rows = table_rows[row_id: row_id + 2]
    for infinitive_tag, participle_tag, row in zip(
            infinitive_tags, participle_tags, rows):
        cells = row.find_all('td')
        infinitive = etree.SubElement(second_infinitive, infinitive_tag)
        _extract_active_and_passive_forms(cells, infinitive, offset=0)
        participle = etree.SubElement(participles_element, participle_tag)
        participle.text = util.clean_text(cells[2].text)
def _extract_active_and_passive_forms(cell_values, root_element, offset=1):
    """Add ``active`` and ``passive`` children to ``root_element``.

    The active form is read from ``cell_values[offset]`` and the passive
    form from ``cell_values[offset + 1]``.

    Args:
        cell_values: Sequence of cell nodes of one table row.
        root_element: Element receiving the two children.
        offset: Index of the cell holding the active form.
    """
    logger.debug('Extracting active and passive forms')
    for cell_index, voice in enumerate(('active', 'passive'), start=offset):
        voice_element = etree.SubElement(root_element, voice)
        voice_element.text = util.clean_text(cell_values[cell_index].text)
def parse_second_accusative_row(noun_case_element, row):
    """Attach the ``genitive`` form of the accusative case.

    The new ``genitive`` element is added to the parent of
    ``noun_case_element``. As called from ``parse_noun_table`` in this
    file, that parent is the ``accusative`` element. The text comes from
    the first ``td`` cell of ``row``.

    Args:
        noun_case_element: Element whose parent receives the new child.
        row: Table row node holding the genitive form.
    """
    accusative_element = noun_case_element.getparent()
    genitive_element = etree.SubElement(accusative_element, 'genitive')
    first_cell = row.find('td')
    genitive_element.text = util.clean_text(first_cell.text)