Python clean_textの例、susaki.wiktionary.wiki_parsing.util.clean_text Pythonの例

コード例 #1

0

ファイルを表示

ファイル: article_parsing.py プロジェクト: SimonTC/SuSaKi

def parse_example(example_soup):
    """Convert the example entries directly below ``example_soup`` to XML.

    Every direct ``dd``/``li`` child becomes an ``Example`` element. When an
    entry contains a nested ``dl``, the text of that ``dl`` is stored in a
    ``Translation`` child and the ``dl`` is emptied, so the ``Text`` child
    holds only the example itself. Entries without a nested ``dl`` are
    treated as quotations and only get a ``Text`` child.

    ``example_soup`` is emptied (``clear()``) once all entries are processed.

    Args:
        example_soup: Node whose direct children are the example entries
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        An ``Examples`` element with one ``Example`` child per entry.
    """
    # Use the module logger throughout; the module-level ``logging.debug``
    # would send these two messages to the root logger instead.
    logger.debug('Start parsing examples')
    examples_root = etree.Element('Examples')
    entries = example_soup.find_all(re.compile('dd|li'), recursive=False)
    logger.debug('Found {} example elements'.format(len(entries)))
    for i, entry in enumerate(entries):
        logger.debug('Parsing example {}'.format(i))
        example_root = etree.SubElement(examples_root, 'Example')
        translation = entry.find('dl')
        if translation is None:
            # Example is placed as a quotation instead of a standard example
            logger.debug('Example placed as a quotation')
        else:
            translation_element = etree.SubElement(
                example_root, 'Translation')
            translation_element.text = util.clean_text(translation.text)
            # Remove translation to avoid having it show up in the example
            # text that is read from the entry below.
            translation.clear()

        text_element = etree.SubElement(example_root, 'Text')
        text_element.text = util.clean_text(entry.text)

    example_soup.clear()
    logger.debug('Finished parsing examples')
    return examples_root

コード例 #2

0

ファイルを表示

ファイル: article_parsing.py プロジェクト: SimonTC/SuSaKi

def parse_example(example_soup):
    """Convert the example entries directly below ``example_soup`` to XML.

    Every direct ``dd``/``li`` child becomes an ``Example`` element. When an
    entry contains a nested ``dl``, the text of that ``dl`` is stored in a
    ``Translation`` child and the ``dl`` is emptied, so the ``Text`` child
    holds only the example itself. Entries without a nested ``dl`` are
    treated as quotations and only get a ``Text`` child.

    ``example_soup`` is emptied (``clear()``) once all entries are processed.

    Args:
        example_soup: Node whose direct children are the example entries
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        An ``Examples`` element with one ``Example`` child per entry.
    """
    # Use the module logger throughout; the module-level ``logging.debug``
    # would send these two messages to the root logger instead.
    logger.debug('Start parsing examples')
    examples_root = etree.Element('Examples')
    entries = example_soup.find_all(re.compile('dd|li'), recursive=False)
    logger.debug('Found {} example elements'.format(len(entries)))
    for i, entry in enumerate(entries):
        logger.debug('Parsing example {}'.format(i))
        example_root = etree.SubElement(examples_root, 'Example')
        translation = entry.find('dl')
        if translation is None:
            # Example is placed as a quotation instead of a standard example
            logger.debug('Example placed as a quotation')
        else:
            translation_element = etree.SubElement(
                example_root, 'Translation')
            translation_element.text = util.clean_text(translation.text)
            # Remove translation to avoid having it show up in the example
            # text that is read from the entry below.
            translation.clear()

        text_element = etree.SubElement(example_root, 'Text')
        text_element.text = util.clean_text(entry.text)

    example_soup.clear()
    logger.debug('Finished parsing examples')
    return examples_root

コード例 #3

0

ファイルを表示

def parse_pronoun_table_row(row):
    """Build the XML element for one row of a pronoun table.

    The first three ``td`` cells of ``row`` are read, in order, as the case
    name, the singular form and the plural form.

    Args:
        row: Table row exposing ``find_all('td')`` (presumably a
            BeautifulSoup tag -- confirm against the caller).

    Returns:
        An element tagged with the cleaned case name, holding a ``singular``
        and a ``plural`` child.
    """
    cells = row.find_all('td')
    case_name, singular, plural = [
        util.clean_text(cells[position].text) for position in range(3)]
    case_element = etree.Element(case_name)
    for tag, form in (('singular', singular), ('plural', plural)):
        child = etree.SubElement(case_element, tag)
        # The form is passed through clean_text a second time here.
        # NOTE(review): likely redundant if clean_text is idempotent.
        child.text = util.clean_text(form)
    return case_element

コード例 #4

0

ファイルを表示

def _extract_fourth_infinitives(table_rows, row_id, infinitives_element):
    """Add the fourth infinitive forms below ``infinitives_element``.

    A ``fourth`` child is created and filled from the two rows starting at
    ``row_id``. Each row adds one element whose tag is the cleaned header
    text and whose text is the cleaned content of the row's first ``td``.
    """
    logger.debug('Extracting the fourth infinitives')
    fourth_root = etree.SubElement(infinitives_element, 'fourth')
    for position, row in enumerate(table_rows[row_id: row_id + 2]):
        data_cells = row.find_all('td')
        header_cells = row.find_all('th')
        # The first row also contains the title cell for the infinitive, so
        # its form label is the second header cell rather than the first.
        label_cell = header_cells[1] if position == 0 else header_cells[0]
        form_element = etree.SubElement(
            fourth_root, util.clean_text(label_cell.text))
        form_element.text = util.clean_text(data_cells[0].text)

コード例 #5

0

ファイルを表示

def _extract_fifth_infinitives(table_rows, row_id, infinitives_element):
    """Add a ``fifth`` child to ``infinitives_element``.

    Its text is the cleaned content of the first ``td`` cell of
    ``table_rows[row_id]``.
    """
    logger.debug('Extracting the fifth infinitives')
    fifth_element = etree.SubElement(infinitives_element, 'fifth')
    first_cell = table_rows[row_id].find_all('td')[0]
    fifth_element.text = util.clean_text(first_cell.text)

コード例 #6

0

ファイルを表示

def find_noun_table_start(rows):
    """Locate the first data row of the main noun table.

    The noun table has two parts. The first part contains nominative,
    genitive, partitive and illative. After these four cases the main table
    begins, and the four cases are repeated there but not in the same order.
    Because of this the first part is skipped and parsing starts at the
    main table.

    The search starts at the second row. The header row of the main table
    is taken to be the first row whose cleaned ``th`` text makes
    ``etree.Element`` raise ``ValueError('Empty tag name')``.

    Args:
        rows: The rows of the table.

    Returns:
        The index in ``rows`` of the row right after the main table's
        header row, i.e. the first entry of the main table.

    Raises:
        ValueError: If no header row is found, or if ``etree.Element``
            rejects a case name for any reason other than it being empty.
    """
    logger.debug('Starting search for the main table')
    for row_index, row in enumerate(rows[1:], start=1):
        case_name = util.clean_text(row.th.text)
        logger.debug(case_name)
        try:
            # Only used as a validity probe; the element itself is discarded.
            etree.Element(case_name)
        except ValueError as err:
            if str(err) != 'Empty tag name':
                raise
            # We found the headers of the real table
            logger.debug('Found the table headers in row {}'.format(row_index))
            return row_index + 1
    raise ValueError("Couldn't find the start of the main table")

コード例 #7

0

ファイルを表示

def parse_noun_table(rows):
    """Convert the main part of a noun declension table to XML.

    Parsing starts at the row reported by ``find_noun_table_start``. Every
    row yields one child of the returned ``table`` element, tagged with the
    cleaned ``th`` text of the row. The accusative case is special: it
    spans two rows. The first is stored in a ``nominative`` child of the
    accusative element and the following row is handed to
    ``parse_second_accusative_row``.

    Args:
        rows: The rows of the table.

    Returns:
        The ``table`` element holding one child per noun case.
    """
    logger.debug('Starting noun table parsing')
    table_root = etree.Element('table')
    expect_second_accusative_row = False
    current_case = None

    first_row = find_noun_table_start(rows)

    for row_number, row in enumerate(rows[first_row:], start=first_row):
        logger.debug('Parsing row {}'.format(row_number))
        if expect_second_accusative_row:
            logger.debug('Entering second accusative row (genitive)')
            parse_second_accusative_row(current_case, row)
            expect_second_accusative_row = False
            continue

        case_name = util.clean_text(row.th.text)
        logger.debug('Creating new noun case element: {}'.format(case_name))
        current_case = etree.SubElement(table_root, case_name)
        if case_name == 'accusative':
            logger.debug('Found the accusative case')
            expect_second_accusative_row = True
            # The forms of this row go into a nested 'nominative' element.
            current_case = etree.SubElement(current_case, 'nominative')
        parse_noun_table_row(row, current_case, case_name)

    logger.debug('Finished noun table parsing')
    return table_root

コード例 #8

0

ファイルを表示

def parse_noun_table_row(row, noun_case_element, noun_case_name):
    """Extract the singular and plural form from a table row.

    The forms are read from the first and second ``td`` cell of ``row`` and
    added to ``noun_case_element`` as ``singular`` and ``plural`` children.
    For the genitive case the plural is read from the first ``span`` inside
    the second cell; '—' is used when that lookup raises ``AttributeError``.
    """
    cells = row.find_all('td')
    singular = cells[0].text
    if noun_case_name != 'genitive':
        plural = util.clean_text(cells[1].text)
    else:
        logger.debug('Entering genitive case')
        try:
            plural = util.clean_text(cells[1].find('span').text)
        except AttributeError:
            # Not all words have plural forms of the genitive case
            plural = '—'
    for tag, form in (('singular', singular), ('plural', plural)):
        child = etree.SubElement(noun_case_element, tag)
        # Note that the plural has already been cleaned once above (unless
        # it is the '—' placeholder); it is cleaned again here.
        child.text = util.clean_text(form)

コード例 #9

0

ファイルを表示

def parse_meta_information(headline_row):
    """Build the meta information element from a table's headline row.

    The cleaned text of the row's ``th`` cell is passed to
    ``extract_meta_information``; the four values it yields (word, kotus
    type, kotus word, gradation) are handed to ``create_meta_tree``.

    Returns:
        Whatever ``create_meta_tree`` returns for those four values.
    """
    logger.debug('Starting extracting meta info from table')
    headline_text = util.clean_text(headline_row.th.text)
    logger.debug('Headline text: {}'.format(headline_text.replace('\n', '')))
    meta_fields = extract_meta_information(headline_text)
    word, kotus_type, kotus_word, gradation = meta_fields
    meta_tree = create_meta_tree(word, kotus_type, kotus_word, gradation)
    logger.debug('Finished extracting meta info from table')
    return meta_tree

コード例 #10

0

ファイルを表示

def _clean_verb_table_titles(text):
    """Turn a verb table title into a string that can be used as a key.

    Titles containing 'tense' are reduced to their first word. In all other
    titles the words are connected with underscores.
    """
    title = util.clean_text(text)
    words = title.split()
    if 'tense' in title:
        return words[0]
    # Splitting on whitespace and re-joining is used instead of a plain
    # str.replace of the space character.
    return '_'.join(words)

コード例 #11

0

ファイルを表示

def _extract_third_infinitives(table_rows, row_id, infinitives_element):
    """Add the third infinitive forms below ``infinitives_element``.

    A ``third`` child is created and filled from the six rows starting at
    ``row_id``. Each row adds one element tagged with the cleaned header
    text; its active and passive forms are taken from the row's first two
    ``td`` cells by ``_extract_active_and_passive_forms``.
    """
    logger.debug('Extracting the third infinitives')
    third_root = etree.SubElement(infinitives_element, 'third')
    for position, row in enumerate(table_rows[row_id: row_id + 6]):
        data_cells = row.find_all('td')
        header_cells = row.find_all('th')
        if position == 0:
            # The first row also contains the title cell for the infinitive:
            # skip that header and drop the row's last data cell.
            label_cell = header_cells[1]
            data_cells = data_cells[:-1]
        else:
            label_cell = header_cells[0]
        form_element = etree.SubElement(
            third_root, util.clean_text(label_cell.text))
        _extract_active_and_passive_forms(data_cells, form_element, offset=0)

コード例 #12

0

ファイルを表示

ファイル: article_parsing.py プロジェクト: SimonTC/SuSaKi

def parse_translation(translation_soup):
    """Convert one translation entry to a ``Translation`` element.

    If the entry contains a ``dl``/``ul`` match, the first one is passed to
    ``parse_example`` and the result is added as a child. The cleaned text
    of the entry is then stored in a ``Text`` child; it is read only after
    the examples have been parsed.

    Args:
        translation_soup: Node holding the translation (presumably a
            BeautifulSoup tag -- confirm against the caller).

    Returns:
        The ``Translation`` element.
    """
    logger.debug('Parsing translation part')
    translation_root = etree.Element('Translation')
    examples_soup = translation_soup.find(re.compile('dl|ul'))
    if examples_soup:
        translation_root.append(parse_example(examples_soup))
    text_element = etree.Element('Text')
    text_element.text = util.clean_text(translation_soup.text)
    translation_root.append(text_element)
    logger.debug('Finished parsing translation part')
    return translation_root

コード例 #13

0

ファイルを表示

ファイル: article_parsing.py プロジェクト: SimonTC/SuSaKi

def parse_translation(translation_soup):
    """Convert one translation entry to a ``Translation`` element.

    If the entry contains a ``dl``/``ul`` match, the first one is passed to
    ``parse_example`` and the result is added as a child. The cleaned text
    of the entry is then stored in a ``Text`` child; it is read only after
    the examples have been parsed.

    Args:
        translation_soup: Node holding the translation (presumably a
            BeautifulSoup tag -- confirm against the caller).

    Returns:
        The ``Translation`` element.
    """
    logger.debug('Parsing translation part')
    translation_root = etree.Element('Translation')
    examples_soup = translation_soup.find(re.compile('dl|ul'))
    if examples_soup:
        translation_root.append(parse_example(examples_soup))
    text_element = etree.Element('Text')
    text_element.text = util.clean_text(translation_soup.text)
    translation_root.append(text_element)
    logger.debug('Finished parsing translation part')
    return translation_root

コード例 #14

0

ファイルを表示

def _extract_first_two_nominal_form_lines(
        table_rows, row_id, infinitives_element, participles_element):
    """Extract the first two rows of the nominal forms.

    For the two rows starting at ``row_id``, the first ``td`` cell becomes
    an infinitive (``first``, then ``long_first``) below
    ``infinitives_element``. The row's participle (``present``, then
    ``past``) is added below ``participles_element`` and filled by
    ``_extract_active_and_passive_forms`` with its default offset.
    """
    # (infinitive tag, participle tag) for each of the two rows
    row_tags = (
        ('first', 'present'),
        ('long_first', 'past'),
    )
    logger.debug('Extracting first two lines of the nominal forms')
    selected_rows = table_rows[row_id: row_id + 2]
    for (infinitive_tag, participle_tag), row in zip(row_tags, selected_rows):
        cells = row.find_all('td')

        infinitive = etree.SubElement(infinitives_element, infinitive_tag)
        infinitive.text = util.clean_text(cells[0].text)

        participle = etree.SubElement(participles_element, participle_tag)
        _extract_active_and_passive_forms(cells, participle)

コード例 #15

0

ファイルを表示

def _extract_nominal_form_lines_3_to_4(
        table_rows, row_id, infinitives_element, participles_element):
    """Extract the third and fourth rows of the nominal forms.

    A ``second`` child is added to ``infinitives_element``. For the two
    rows starting at ``row_id`` it receives an ``inessive`` and then an
    ``instructive`` element, filled from the row's first two ``td`` cells
    by ``_extract_active_and_passive_forms``. The third cell of each row
    becomes the ``agent`` and then the ``negative`` participle below
    ``participles_element``.
    """
    second_infinitive = etree.SubElement(infinitives_element, 'second')
    # (second-infinitive tag, participle tag) for each of the two rows
    row_tags = (
        ('inessive', 'agent'),
        ('instructive', 'negative'),
    )
    logger.debug('Extracting third and fourth lines of the nominal forms')
    selected_rows = table_rows[row_id: row_id + 2]
    for (infinitive_tag, participle_tag), row in zip(row_tags, selected_rows):
        cells = row.find_all('td')

        infinitive = etree.SubElement(second_infinitive, infinitive_tag)
        _extract_active_and_passive_forms(cells, infinitive, offset=0)

        participle = etree.SubElement(participles_element, participle_tag)
        participle.text = util.clean_text(cells[2].text)

コード例 #16

0

ファイルを表示

def _extract_active_and_passive_forms(cell_values, root_element, offset=1):
    """Add an ``active`` and a ``passive`` child to ``root_element``.

    The active form is the cleaned text of ``cell_values[offset]`` and the
    passive form that of ``cell_values[offset + 1]``.
    """
    logger.debug('Extracting active and passive forms')
    for cell_index, voice in enumerate(('active', 'passive'), start=offset):
        voice_element = etree.SubElement(root_element, voice)
        voice_element.text = util.clean_text(cell_values[cell_index].text)

コード例 #17

0

ファイルを表示

def parse_second_accusative_row(noun_case_element, row):
    """Add a ``genitive`` element next to ``noun_case_element``.

    The new element is appended to the parent of ``noun_case_element`` and
    holds the cleaned text of the first ``td`` cell of ``row``.
    """
    parent_element = noun_case_element.getparent()
    genitive_element = etree.SubElement(parent_element, 'genitive')
    genitive_element.text = util.clean_text(row.find('td').text)