def search(word, exact=True, return_words=True):
    """
    Look a word up on the search page and return the matching entries.

    The search URL is built from SEARCH_URL_FORM, the page is downloaded and
    every link found in an <h2> heading inside the
    <section id="block-duden-tiles-0"> element is treated as one search hit.
    The last path component of the link's href is the hit's "urlname".

    Parameters:
        word: the text to search for.
        exact: if True, keep only hits whose cleaned link text equals `word`;
            if False, keep all hits.
        return_words: if True, return `get(urlname)` for every hit;
            if False, return the plain urlname strings.

    Returns an empty list when the page contains no result section
    (presumably the case when the search has no hits - verify against the
    live page).
    """
    url = SEARCH_URL_FORM.format(word=word)
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    main_sec = soup.find('section', id='block-duden-tiles-0')
    if main_sec is None:
        # find() yields None when the section is absent; there is nothing
        # to iterate over, so report "no hits" instead of crashing
        return []

    urlnames = []
    for h2 in main_sec.find_all('h2'):
        link = h2.a
        # headings without a usable link cannot be turned into an urlname
        if link is None or not link.get('href'):
            continue
        if exact and clear_text(link.text) != word:
            continue
        urlnames.append(link['href'].split('/')[-1])

    if return_words:
        return [get(urlname) for urlname in urlnames]
    return urlnames
def _table_node_to_tagged_cells(self, table_node):
    """
    Convert a table HTML node into a flat list of tagged cell strings.

    Every data cell is tagged with its column header (from the table's top
    header), its row header (from the table's left header) and, if present,
    the table name found in the upper-leftmost header cell.

    The return value is a list of 2-tuples:
        [(tag_set, text), ...]
    where `text` is the cleaned string taken from the cell and `tag_set` is
    a set of strings (tags).

    Example: a cell in the 3rd row and 2nd column with the text
    'der Barmherzigkeit', whose top header (1st row, 2nd column) is
    'Singular' and whose left header (1st column, 3rd row) is 'Genitiv',
    yields:
        ({'Singular', 'Genitiv'}, 'der Barmherzigkeit')

    The first row counts as a header row if it is inside a <thead> tag.
    The first column counts as a header column if its cells are <th> nodes.

    For tables named PRASENS or PRATERITUM the rows are additionally tagged
    with PERSON_1, PERSON_2, PERSON_3, repeating in that order row by row.

    A table without a <tbody> or without any body rows yields an empty list.
    """
    left_header = []
    top_header = None
    table_content = []
    table_name = ''

    # convert table html node to raw table (list of lists) and optional
    # left and top headers (also lists)
    if table_node.thead:
        top_header = [clear_text(t.text)
                      for t in table_node.thead.find_all('th')]

    tbody = table_node.tbody
    body_rows = tbody.find_all('tr') if tbody is not None else []
    for row in body_rows:
        if row.th:
            left_header.append(clear_text(row.th.text))
        table_content.append(
            [clear_text(td.text) for td in row.find_all('td')])

    if not table_content:
        # no data rows: nothing to tag, and table_content[0] below would
        # otherwise raise IndexError when there is no top header
        return []

    if top_header and left_header:
        # with both headers present the upper-left cell names the table
        table_name = top_header[0]
        top_header = top_header[1:]

    # sanitize missing cells: an empty left header cell inherits the
    # closest non-empty value above it
    last_nonempty_cell = ''
    for i, cell in enumerate(left_header):
        if cell == '':
            left_header[i] = last_nonempty_cell
        else:
            last_nonempty_cell = cell

    # convert left, top, and table headers to sets for easier tagging
    if left_header:
        left_header = [{cell} for cell in left_header]
    else:
        left_header = [set() for _ in table_content]

    if top_header:
        top_header = [{cell} for cell in top_header]
    else:
        top_header = [set() for _ in table_content[0]]

    table_tag = {table_name} if table_name else set()

    if table_name in [PRASENS, PRATERITUM]:
        person_tags = [{PERSON_1}, {PERSON_2}, {PERSON_3}]
    else:
        person_tags = [set(), set(), set()]

    # create a list of tagged strings; person tags repeat every three rows
    tagged_strings = []
    row_iter = zip(table_content, left_header, cycle(person_tags))
    for row, row_tag, person_tag in row_iter:
        for cell, col_tag in zip(row, top_header):
            tags = table_tag | row_tag | col_tag | person_tag
            tagged_strings.append((tags, cell))

    return tagged_strings