Ejemplo n.º 1
0
def search(word, exact=True, return_words=True):
    """
    Search for ``word`` and return the matching entries.

    The search URL is built from ``SEARCH_URL_FORM``, the result page is
    downloaded, and every link found inside an ``<h2>`` of the
    ``block-duden-tiles-0`` section is treated as one hit.

    Args:
        word: The term to look up; substituted into ``SEARCH_URL_FORM``.
        exact: When true, keep only hits whose cleaned link text equals
            ``word``. When false, keep every hit.
        return_words: When true, pass each hit's url name to ``get`` and
            return those results. When false, return the url names
            themselves.

    Returns:
        A list of ``get(urlname)`` results or a list of url name strings
        (the last path segment of each link's ``href``). The list is empty
        when the page contains no result section.
    """
    url = SEARCH_URL_FORM.format(word=word)
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    main_sec = soup.find('section', id='block-duden-tiles-0')

    # ``find`` yields None when the section is absent (e.g. nothing was
    # found); report "no hits" instead of failing on ``None.find_all``.
    if main_sec is None:
        return []

    # ``h2.a`` is None for a heading without a link; skip such headings
    # because they carry neither an href nor a word to compare.
    a_tags = [h2.a for h2 in main_sec.find_all('h2') if h2.a is not None]

    urlnames = [a['href'].split('/')[-1]
                for a in a_tags
                if (not exact) or clear_text(a.text) == word]
    if return_words:
        return [get(urlname) for urlname in urlnames]
    return urlnames
Ejemplo n.º 2
0
    def _table_node_to_tagged_cells(self, table_node):
        """
        Convert a table HTML node into a list of tagged cell strings.

        Every data cell (``<td>``) is tagged using the table's top and left
        headers and, when both headers exist, the table name found in the
        upper-leftmost header cell.

        The return type is a list of 2-tuples:
        [(tag_set, text), ...]

        where text is a string taken from the cell, and tag_set is a set of
        strings (tags). If e.g. the cell in the 3rd row and 2nd column with
        the text 'der Barmherzigkeit' has its top_header tag (1st row, 2nd
        column) 'Singular' and its left_header tag (1st column, 3rd row)
        'Genitiv', the corresponding tuple would look like:
        ({'Singular', 'Genitiv'}, 'der Barmherzigkeit')

        The first row is considered a header row if it is inside of a
        <thead> html tag. The first column is considered a header column if
        the corresponding cells are <th> html nodes.

        If the table name is PRASENS or PRATERITUM, the rows are additionally
        tagged with PERSON_1, PERSON_2, PERSON_3, repeating in that order.

        A table whose body contains no rows yields an empty list.
        """
        left_header = []
        top_header = None
        table_content = []
        table_name = ''

        # convert table html node to raw table (list of lists) and optional
        # left and top headers (also lists)
        if table_node.thead:
            top_header = [clear_text(t.text)
                          for t in table_node.thead.find_all('th')]

        for row in table_node.tbody.find_all('tr'):
            if row.th:
                left_header.append(clear_text(row.th.text))

            tds = row.find_all('td')
            table_content.append([clear_text(td.text) for td in tds])

        # No body rows means there is nothing to tag. Returning here also
        # keeps the ``table_content[0]`` lookup below from raising IndexError
        # when the table has no top header either.
        if not table_content:
            return []

        # With both headers present, the first top-header cell sits above the
        # left header column, so it is taken as the table name rather than as
        # a column tag.
        if top_header and left_header:
            table_name = top_header[0]
            top_header = top_header[1:]

        # sanitize missing cells: an empty left-header cell inherits the most
        # recent non-empty one above it
        last_nonempty_cell = ''
        for i, cell in enumerate(left_header):
            if cell == '':
                left_header[i] = last_nonempty_cell
            else:
                last_nonempty_cell = cell

        # convert left, top, and table headers to sets for easier tagging;
        # a missing header contributes an empty tag set per row / column
        if left_header:
            left_header = [{cell} for cell in left_header]
        else:
            left_header = [set() for _ in table_content]
        if top_header:
            top_header = [{cell} for cell in top_header]
        else:
            top_header = [set() for _ in table_content[0]]
        table_tag = {table_name} if table_name else set()

        # the person tags are cycled over the rows below, so row 1 gets
        # PERSON_1, row 2 PERSON_2, row 3 PERSON_3, row 4 PERSON_1 again, ...
        if table_name in [PRASENS, PRATERITUM]:
            person_tags = [{PERSON_1}, {PERSON_2}, {PERSON_3}]
        else:
            person_tags = [set(), set(), set()]

        # create a list of tagged strings
        tagged_strings = []
        for row, row_tag, person_tag \
                in zip(table_content, left_header, cycle(person_tags)):
            for cell, col_tag in zip(row, top_header):
                taglist = table_tag \
                    .union(row_tag) \
                    .union(col_tag) \
                    .union(person_tag)
                tagged_strings.append((taglist, cell))
        return tagged_strings