def parse_thread_page(el: bs4.element.Tag) -> AttrDict:
    """Extract user, post body (html and text) and date from one forum post."""
    post = AttrDict()
    post.user = el.select('.postprofile dt')[0].text.strip()
    content = el.select('.content')[0]
    post.body_html = str(content).strip()
    post.body_text = content.text.strip()
    post.date = el.select('.postbody .author')[0].text.strip()
    return post
def parse_link(link: bs4.element.Tag, domain: str) -> AttrDict:
    """Describe one forum topic row: title, views, answers, date and url."""
    out = AttrDict()
    first_anchor = link.select('a:nth-of-type(1)')[0]
    out.title = first_anchor.text
    out.views = link.select('.views')[0].text.replace('Zugriffe', '').strip()
    out.answers = link.select('.posts')[0].text.replace('Antworten', '').strip()
    out.date = link.select('a:nth-of-type(3)')[0].text
    out.url = domain + first_anchor.attrs['href'].replace('./', '/')
    return out
def _movie_item(item_soup: bs4.element.Tag) -> MovieItem:
    """Build a MovieItem from one listing entry's picture and title blocks."""
    litpic = item_soup.find('div', attrs={'class': 'litpic'})
    title_div = item_soup.find('div', attrs={'class': 'title'})
    paragraphs = title_div.find_all('p')
    return MovieItem(
        litpic.a.img['src'],        # poster image url
        _url(litpic.a['href']),     # subject page url
        paragraphs[0].b.getText(),  # title
        paragraphs[1].a.getText(),  # other names
        paragraphs[2].getText(),    # info line
        paragraphs[3].getText(),    # star/rating line
    )
def _parse_column(gatherer_column: bs4.element.Tag) -> GathererCard:
    """Parse a single gatherer page 'rightCol' entry."""
    # Map each row's label (without trailing ':') to its value element.
    label_to_values = {}
    for row in gatherer_column.find_all("div", class_="row"):
        label = row.find("div", class_="label").getText(strip=True).rstrip(":")
        label_to_values[label] = row.find("div", class_="value")

    card_name = label_to_values["Card Name"].getText(strip=True)
    card_types = label_to_values["Types"].getText(strip=True)

    flavor_lines = []
    if "Flavor Text" in label_to_values:
        flavor_lines = [
            box.getText(strip=True)
            for box in label_to_values["Flavor Text"].find_all(
                "div", class_="flavortextbox")
        ]

    text_lines = []
    if "Card Text" in label_to_values:
        text_lines = [
            _replace_symbols(box).getText().strip()
            for box in label_to_values["Card Text"].find_all(
                "div", class_="cardtextbox")
        ]

    # Empty joined text collapses to None.
    return GathererCard(
        card_name=card_name,
        original_types=card_types,
        original_text="\n".join(text_lines).strip() or None,
        flavor_text="\n".join(flavor_lines).strip() or None,
    )
def get_pronounce(p: bs4.element.Tag):
    """Split the <ul> text into tokens and capture (word, [pronunciation]) pairs."""
    tokens = p.find('ul').text.strip().split()
    # NOTE(review): assumes every token matches '...[...]' — re.match returning
    # None would raise AttributeError here; confirm against the page markup.
    return [re.match(r'(.*)(\[.*\])', token).groups() for token in tokens]
def get_first_detail_annuncio(annuncio: bs4.element.Tag) -> tuple:
    """
    Get the listing's code and the link to its detail page.

    Returns:
    --------
    code_annuncio: str
        the listing's code
    link_annuncio: str
        link to the listing's detail page
    """
    code_annuncio = annuncio.find('dd').text
    # The detail link is the last anchor in the listing element.
    link_annuncio = annuncio.find_all('a')[-1]['href']
    return code_annuncio, link_annuncio
def _get_graphic_url(content: bs4.element.Tag) -> Optional[str]:
    """
    Get URL to the graphic.

    Returns None when the anchor is missing or has no href.
    (Fix: the original annotation ``-> str or None`` evaluates to just
    ``str`` at runtime; ``Optional[str]`` is the correct spelling.)
    """
    a = content.find('a', {'target': '_blank'})
    try:
        link = a['href']
    except (KeyError, TypeError, AttributeError):
        # a is None, or the tag has no 'href'
        return None
    return f"{BASE_RNC_URL}/{link}"
def _unplayed_games(M: int, N: int, table: bs4.element.Tag) -> pd.DataFrame:
    """Boolean frame marking the last N cells of M body rows as unplayed.

    A cell counts as unplayed when it is empty or carries the 'unplayed'
    CSS class (parentheses added only to spell out the original
    or/and precedence).
    """
    data = []
    for tr in table.find_all('tr')[4:4 + M]:
        row = [
            (not td.text) or (td.has_attr('class') and td['class'][0] == 'unplayed')
            for td in tr.find_all('td')[-N:]
        ]
        data.append(row)
    columns = pd.MultiIndex.from_tuples(
        [('Unplayed', str(n + 1)) for n in range(N)])
    return pd.DataFrame(data=data, columns=columns)
def define_tr_to_str(tr: bs4.element.Tag) -> str:
    """Render one API-docs table row as a markdown line: `header` - description."""
    # Make every link in the row absolute.
    for anchor in tr.find_all("a"):
        anchor["href"] = BASE_API_URL + anchor["href"]
    header = tr.find("td", class_="header").string
    contents = tr.find("td", class_="description").contents
    if not contents:
        return f"`{header}`"
    if len(contents) > 1 and "\n" not in contents[0]:
        # Multi-node description: convert everything but the last node via tomd.
        html = ''.join(str(item) for item in contents[:-1])
        description = tomd.convert(f"<p>{html}</p>").strip()
    else:
        # Single node: keep only its first line.
        description = contents[0].split('\n')[0].strip()
    return f"`{header}` - {description}"
def _analyse_version(version: bs4.element.Tag, season: int, episode: int):
    """Download the first English, completed subtitle row; True on success."""
    for row in version.find_all("div", attrs={"class": "row"}):
        # Only analyse English rows
        if not row.find_all("span", attrs={"class": "flag-icon flag-icon-gb"}):
            continue
        # Only analyse subtitles which are complete
        if not row.find_all(text="Completed"):
            continue
        if _download_srt(row, season, episode):
            return True
def find_courses_from_section(self, section: bs4.element.Tag):
    """Return the course name strings inside *section* (empty list for None)."""
    if section is None:
        return []
    rows = section.find_all(class_='m-single-course-top-row')
    spans = [row.find('span') for row in rows]
    return [span.string for span in spans]
def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
    """
    Replace all references in element with special tokens.

    Citations (<cit>) become BIBREF tokens; other <ref> targets become
    SECREF/FIGREF/TABREF/EQREF/FOOTREF/SECREFU tokens depending on their
    prefix and their presence in ref_map.

    :param sp: soup used to create the replacement strings
    :param el: element whose references are rewritten in place
    :param ref_map: known reference ids, used to disambiguate 'uid' targets
    :return: el, with references replaced
    """
    # replace all citations with cite keyword
    for cite in el.find_all('cit'):
        try:
            target = cite.ref.get('target').replace('bid', 'BIBREF')
            cite.replace_with(sp.new_string(f" {target} "))
        except AttributeError:
            print('Attribute error: ', cite)
            continue

    # replace all non citation references
    for rtag in el.find_all('ref'):
        try:
            raw = rtag.get('target')  # hoisted: was re-read many times
            if raw and not raw.startswith('bid'):
                if raw.startswith('cid'):
                    target = raw.replace('cid', 'SECREF')
                elif raw.startswith('uid'):
                    # Try each reference kind in the original priority order;
                    # keep the first candidate actually present in ref_map.
                    for kind in ('FIGREF', 'TABREF', 'EQREF',
                                 'FOOTREF', 'SECREFU'):
                        candidate = raw.replace('uid', kind)
                        if candidate in ref_map:
                            target = candidate
                            break
                    else:
                        target = raw.upper()
                else:
                    print('Weird ID!')
                    target = raw.upper()
                rtag.replace_with(sp.new_string(f" {target} "))
        except AttributeError:
            print('Attribute error: ', rtag)
            continue
    return el
def link_to_folder(link: bs4.element.Tag) -> str:
    """Return the anchor's href if it names a plain folder, else ''.

    Rejects absolute URLs (scheme or host present) and any normalized path
    that still contains separators or is a dot segment.
    """
    raw_url: str = link.get("href", default="")
    parsed: ParseResult = urlparse(raw_url)
    if parsed.scheme or parsed.netloc:
        return ""
    normalized: str = posixpath.normpath(parsed.path)
    if "/" in normalized or normalized in (".", ".."):
        return ""
    return normalized
def get_text_author(quote: bs4.element.Tag) -> Optional[str]:
    """Return author from html tag class as string, or None when missing."""
    try:
        raw = quote.find(class_="authorOrTitle").text
    except AttributeError:
        # find() returned None — no author element present
        return None
    return raw.replace(",", "").replace("\n", "").strip()
def clean_tags(el: bs4.element.Tag):
    """
    Rename every tag listed in SUBSTITUTE_TAGS to its lowercase form, in place.

    :param el: element whose matching descendants are renamed
    :return:
    """
    for tag_name in SUBSTITUTE_TAGS:
        for descendant in el.find_all(tag_name):
            descendant.name = tag_name.lower()
def _download_srt(row: bs4.element.Tag, season: int, episode: int):
    """Download the first subtitle link in *row* to FOLDER; True once saved."""
    for anchor in row.find_all("a", attrs={"rel": "nofollow"}):
        response = requests.get(f"{BASE_URL}/{anchor['href']}")
        # zero-pad single-digit season/episode numbers
        season_str = str(season) if season >= 10 else f"0{season}"
        episode_str = str(episode) if episode >= 10 else f"0{episode}"
        target = f"{FOLDER}/{NAME}.S{season_str}E{episode_str}.srt"
        with open(target, "wb") as f:
            f.write(response.content)
        return True
def get_text_tags(quote: bs4.element.Tag) -> Optional[list]:
    """Return the list of tags from the quote element, or None when missing.

    Bug fix: the original ended with ``return tags.strip()`` on a *list*,
    which always raised AttributeError inside the try block and made the
    function return None even when tags were present.
    """
    try:
        raw = quote.find(class_="greyText smallText left").text
    except AttributeError:
        # find() returned None — no tag element present
        return None
    tags = [part.strip() for part in raw.split(',')]
    # Drop the first entry — presumably a "tags:" label, not a real tag;
    # TODO confirm against the page markup.
    return tags[1:]
def get_text_title(quote: bs4.element.Tag) -> Optional[str]:
    """Return title from html tag class as string, or None when missing."""
    try:
        anchor = quote.find(class_="authorOrTitle")
        # Title lives two siblings past the author element.
        raw = anchor.nextSibling.nextSibling.text
    except AttributeError:
        return None
    return raw.replace("\n", "").strip()
def from_td_tag(cls, quality: Quality, tag: bs4.element.Tag):
    """Parse a PriceData from a price cell; None (implicit) when parsing fails."""
    try:
        cell_text = tag.find('table').find('tr').find_all('td')[1].find('div').text
        match = _price_regex.match(cell_text)
        return PriceData(
            quality=quality,
            quantity=int(match[1]),
            price=float(match[2]),
        )
    except Exception:
        # best-effort: malformed markup or regex miss yields None
        pass
def row_to_transaction(row: bs4.element.Tag) -> Transaction:
    """
    Convert one <tr> with multiple <td> entries into a named tuple.

    :param row: the <tr> tag, including its <td> children
    :return: Transaction built from the stripped cell texts, in order
    """
    cells = [cell.text.strip() for cell in row.find_all('td')]
    return Transaction(*cells)
def parse_calendar_table(self, course_code, table: bs4.element.Tag) -> dict:
    """Map each calendar row (header skipped) to {day: row_data}."""
    body_rows = table.find_all('tr', recursive=False)[1:]
    parsed = (self.parse_calendar_row(course_code, row) for row in body_rows)
    return {day: row_data for day, row_data in parsed}
def _parse_doc(self, doc: bs4.element.Tag) -> List:
    """Parse one document: collect every example table, recording wordforms."""
    examples = []
    for table in doc.find_all('table', {'class': 'para'}):
        parsed = self._parse_example(table)
        examples.append(parsed)
        self._add_wordforms(parsed.found_wordforms)
    return examples
def get_explain(e: bs4.element.Tag):
    """Return (kind, text) pairs for every div under the result list items.

    kind is 'pos' for part-of-speech buttons, 'explain' for dictionary
    explanations, and '?' for anything else.
    """
    def classify(classes):
        if 'pos_button' in classes:
            return 'pos'
        if 'dictionaryExplanation' in classes:
            return 'explain'
        return '?'

    pairs = []
    for item in e.select('ul > li'):
        for div in item.select('div'):
            pairs.append((classify(div.attrs['class']), div.text))
    return pairs
def process_match_details(details: bs.element.Tag) -> dict:
    """Merge hour, goal times and extra info from a match-details row."""
    hour_td, home_td, _, visitor_td, info_td = details.find_all('td')
    merged = {"hour": hour_td.text.replace('h', ':').strip()}
    merged.update(extract_goals_time(home_td, "goals_home_team"))
    merged.update(extract_goals_time(visitor_td, "goals_visitor_team"))
    merged.update(extract_more_info(info_td))
    return merged
def downstream_points(ts: int, table1: bs4.element.Tag, table2: bs4.element.Tag) \
        -> typ.Generator[InfluxPoint, None, None]:
    """Yield one InfluxDB point per downstream channel.

    Combines the power-level table (table1) with the codeword-count table
    (table2) and reports per-channel SNR, modulation, power, and the
    *interval* (change since the previous call) of correctable and
    uncorrectable codeword counts.

    NOTE: stateful — the module globals last_correct/last_uncorrect hold the
    previous absolute counters so intervals can be computed across calls.
    """
    global last_correct, last_uncorrect
    # Read upper table with power levels.
    # Calling a Tag is bs4's find_all shortcut: tbody('tr', ...) ==
    # tbody.find_all('tr', ...).
    _, channel, frequency, snr, modulation, power = \
        table1.find('tbody')('tr', recursive=False)
    channel_ids = [int(td.text) for td in datacells(channel)]
    # SNR and power cells look like "<number> <unit>"; keep the number.
    snrs = [float(td.text.split(' ')[0]) for td in datacells(snr)]
    modulations = [td.text.strip() for td in datacells(modulation)]
    power_levels = [float(td.text.split(' ')[0]) for td in datacells(power)]
    # Read lower table with codeword counts.
    _, _, _, correctable, uncorrectable = table2.find('tbody')('tr', recursive=False)
    correctables = [int(td.text) for td in datacells(correctable)]
    # Interval = current - previous; .get(..., v) makes an unseen channel's
    # first interval 0 instead of the raw counter.
    int_correctables = [
        v - last_correct.get(channel_ids[i], v)
        for i, v in enumerate(correctables)
    ]
    uncorrectables = [int(td.text) for td in datacells(uncorrectable)]
    int_uncorrectables = [
        v - last_uncorrect.get(channel_ids[i], v)
        for i, v in enumerate(uncorrectables)
    ]
    # Remember the absolute counters for the next poll.
    last_correct = {channel_ids[i]: v for i, v in enumerate(correctables)}
    last_uncorrect = {channel_ids[i]: v for i, v in enumerate(uncorrectables)}

    def field_set(i: int) -> InfluxSet:
        # Field values for the i-th channel row.
        return {
            'snr_db': snrs[i],
            'modulation': modulations[i],
            'power_dbmv': power_levels[i],
            'interval_correctable_codewords': int_correctables[i],
            'interval_uncorrectable_codewords': int_uncorrectables[i]
        }

    yield from (InfluxPoint(measurement='downstream',
                            tag_set={'channel_id': cid},
                            field_set=field_set(i),
                            timestamp=ts)
                for i, cid in enumerate(channel_ids))
def extract_match_data(tb_match: bs.element.Tag) -> dict:
    """Combine the header and details rows of one match table into a dict."""
    rows = tb_match.find_all('tr')
    if not rows:
        return {}
    header_row, details_row = rows[:2]
    combined = dict(process_match_header(header=header_row))
    combined.update(process_match_details(details=details_row))
    return combined
def extract_more_info(more_info: bs.element.Tag) -> dict:
    """Pull audience and income figures that follow their <b> labels."""
    bold_tags = more_info.find_all('b')

    audience_labels = [b for b in bold_tags if b.text == 'Público:']
    if audience_labels:
        # strip thousands separators (spaces and dots)
        audience = (audience_labels[0].nextSibling.strip()
                    .replace(' ', '').replace('.', ''))
    else:
        audience = ""

    income_labels = [b for b in bold_tags if b.text == 'Renda:']
    if income_labels:
        # normalise decimal comma to a dot
        income = income_labels[0].nextSibling.strip().replace(',', '.')
    else:
        income = ""

    return {"audience": audience, "income": income}
def process_match_header(header: bs.element.Tag) -> dict:
    """Merge date, teams, result and stadium data from a match-header row."""
    day_month_td, home_td, result_td, visitor_td, stadium_td = header.find_all('td')
    merged = {"day_month": day_month_td.text.strip()}
    merged.update(extract_team_data(team=home_td, st_home=True))
    merged["result"] = result_td.text.strip()
    merged.update(extract_team_data(team=visitor_td, st_home=False))
    merged.update(extract_stadium_data(stadium=stadium_td))
    return merged
def get_noti_id(self, bs4_item_tag: bs4.element.Tag):
    """Return the post number (nttSn) from the item's <link> URL query string.

    Fixes two defects in the original: the substring test ``'nttSn' in p``
    also matched keys merely containing 'nttSn' (e.g. 'prevNttSn'), and the
    error message named the parameter 'sttSn' instead of 'nttSn'.

    :raises ValueError: when the URL query string has no nttSn parameter
    """
    noti_url = bs4_item_tag.find('link').get_text()
    # extract the query parameters (쿼리 파라메터 추출)
    query = noti_url.split('?')[1]
    for param in query.split('&'):
        key, _, value = param.partition('=')
        if key == 'nttSn':  # exact key match, not substring
            return int(value)
    raise ValueError('Could not find post number (nttSn) in Link')
def get_other_page_urls_from_overview_page_stepbridge_my_results(page_soup: bs4.element.Tag) -> list:
    """Collect the unique pagination URLs from a my-results overview page."""
    try:
        pagination = page_soup.find('ul', {'class': 'pagination'})
        anchors = []
        for item in pagination.find_all('li', {'class': 'page-item'}):
            anchor = item.find('a')
            if anchor is not None:
                anchors.append(anchor)
        urls = [anchor['href'] for anchor in anchors]
        # drop duplicates while preserving first-seen order
        return list(OrderedDict.fromkeys(urls))
    except AttributeError:
        # no pagination block on the page
        return []
def scale_an_image_with_css(img: bs4.element.Tag, height: int) -> None:
    """Set or update a 'max-height' rule in *img*'s inline style attribute."""
    assert (height >= 0)
    rule = 'max-height:{height:d}px;'.format(height=height)
    if 'style' not in img.attrs:
        # No inline style yet: create one holding just the max-height rule.
        img.attrs['style'] = rule
        return None
    style = img.attrs['style']
    if not re.search(r'max-height:[^;]*', style):
        # Style exists but lacks max-height: append the rule.
        if not style.endswith(';'):
            style += ';'
        img.attrs['style'] = style + rule
        return None
    # Rewrite the value of the existing max-height declaration in place.
    img.attrs['style'] = re.sub(
        r'(?P<prefix>.*max-height:)[^;]*(.*)',
        r'\g<prefix>' + '{height:d}'.format(height=height) + r'px\2',
        style)
def _parse_previous_price(product: bs4.element.Tag) -> decimal.Decimal:
    """
    Parse the fragment with the previous product price.

    When the fragment is absent, assume there is no price reduction and
    fall back to the current price.
    """
    tag = product.find("span", class_="as-price-previousprice")
    if not tag:
        return _parse_current_price(product)
    return _extract_price(tag.get_text())
def contains_more_details(self, element: bs4.element.Tag):
    """Check whether *element* has a details block with a 'Докладніше' span."""
    detail_block = element.find("div", class_="TJUuge")
    if detail_block is None:
        return False
    return any(span.text == "Докладніше"
               for span in detail_block.findAll("span"))
def _expected(pid: int, table: bs4.element.Tag) -> pd.DataFrame:
    """Build the expected-results DataFrame for one player's HTML table.

    Reads the table with a two-level header, tags every row with *pid*
    (moved to the first column), and adds an 'Unplayed' column derived from
    the last <td> of each body row (NaN when the cell has no class attr).
    """
    return (
        pd.read_html(str(table), header=[1, 2])[0].assign(pid=pid)
        # make 'pid' the first column
        .pipe(lambda df: df.
              loc[:, df.columns.to_list()[-1:] + df.columns.to_list()[:-1]]).
        assign(Unplayed=pd.Series([
            # True/False from the 'unplayed' class marker; NaN when absent
            td['class'][0] == 'unplayed' if td.has_attr('class') else np.nan
            # rows before index 3 are header rows — TODO confirm offset
            for tr in table.find_all('tr')[3:]
            for td in tr.find_all('td')[-1:]
        ])))
def _get_text(tag: bs4.element.Tag) -> str:
    """
    Return the example's text with duplicate spaces removed.

    It is assumed that all examples have text.
    """
    raw = tag.get_text()
    # remove duplicate spaces
    return clean_text_up(raw)
def _parse_media(self, media: bs4.element.Tag) -> Tuple[str, str]:
    """
    Get link to the media file and its target filepath.

    Fix: removed the original ``try: ... except Exception: raise`` wrapper,
    a no-op that only re-raised; missing anchors/hrefs still propagate
    (TypeError/KeyError) exactly as before.

    :return: (media_link, path) — NOTE(review): the second element is
        actually ``self.MEDIA_FOLDER / filename`` (a Path), not a str as
        annotated; confirm and fix the annotation upstream.
    """
    media_link = media.find('a')['href']
    media_link, filename = media_link.split('?name=')
    return media_link, self.MEDIA_FOLDER / filename
def get_grammar(d: bs4.element.Tag):
    """Return the text of each dictionary-card list item in the search results."""
    selector = ('div#web ol.searchCenterMiddle '
                'div.dictionaryWordCard > ul > li')
    return [text(item) for item in d.select(selector)]
def play_items_from_log_entry(entry: bs4.element.Tag) -> bs4.element.ResultSet:
    """Return every <td> cell of one play-log entry row."""
    return entry.find_all("td")


def player(play_text_: str) -> str:
    """Return the first two whitespace-separated words (the player's name)."""
    tokens = play_text_.split()
    return ' '.join(tokens[:2])