def wrap_picture(
    soup: bs4.BeautifulSoup,
    img_tag: bs4.element.Tag,
    image: Image,
    domain: Url,
) -> None:
    """Wrap ``img_tag`` in a ``<picture>`` that also offers a WebP source.

    A new ``<picture>`` element is created around ``img_tag`` and a new
    ``<source type="image/webp">`` element is inserted directly before the
    image. The ``class`` value of the image is moved onto the ``<picture>``
    (an empty string is used when the image has no truthy ``class``) and the
    image's own ``class`` is reset to an empty list.

    :param soup: document used to create the new tags
    :param img_tag: the image tag to wrap; modified in place
    :param image: supplies ``path`` and ``thumbnail_widths`` for the srcset
    :param domain: forwarded as the first argument of ``write_srcset``
    :return: nothing; the document tree is mutated in place
    """
    wrapper = soup.new_tag('picture')
    webp_source = soup.new_tag('source')

    # Hand the image's classes over to the wrapper.
    wrapper['class'] = img_tag.get('class') or ""
    img_tag['class'] = []

    # Resulting tree: <picture><source/><img/></picture>
    img_tag.wrap(wrapper)
    img_tag.insert_before(webp_source)

    webp_path = image.path.with_suffix('.webp')
    webp_source.attrs.update({  # type: ignore[attr-defined]
        'type': 'image/webp',
        'srcset': write_srcset(domain, webp_path, image.thumbnail_widths),
        # Mirror the image's own ``sizes`` attribute (empty when absent).
        'sizes': img_tag.get('sizes', ''),  # type: ignore[dict-item]
    })
def _parse_one_person(self, person: bs4.element.Tag, _stage: str, i: int) -> Dict:
    """Turn one player element into a flat record.

    The texts of the ``div`` children styled with ``self._style_font_xs`` are
    read; exactly four are expected (trophy, an ignored value, level, name).
    The hero is looked up in ``self.hero_map`` by the ``src`` of the first
    ``img``, and the player id is the last path segment of the element's
    ``href``.

    ``group`` / ``is_mvp`` depend on ``_stage``:

    * ``"Duo Showdown"``: ``group = i // 2``, ``is_mvp = nan``
    * ``"Showdown"``: ``group = i``, ``is_mvp = nan``
    * anything else: ``group = nan``; ``is_mvp`` is whether an ``img`` whose
      ``src`` equals ``self._img_mvp`` exists inside the element

    :param person: the element describing one player
    :param _stage: stage name selecting the grouping rule above
    :param i: position of the player, used to derive ``group``
    :return: dict with keys ``trophy``, ``level``, ``name``, ``hero``,
        ``playerId``, ``isTeammate``, ``group`` and ``is_mvp``
    """
    cell_texts = [
        cell.text.strip()
        for cell in person.find_all("div", attrs={"style": self._style_font_xs})
    ]
    # Unpacking enforces the four-cell layout; the second cell is unused.
    trophy_text, _, level_text, player_name = cell_texts

    record = {
        "trophy": int(trophy_text),
        "level": int(level_text),
        "name": player_name,
        "hero": self.hero_map[person.find("img").get("src")],
        "playerId": person.get("href").split("/")[-1],
        "isTeammate": False,
    }

    if _stage == "Duo Showdown":
        group, is_mvp = i // 2, np.nan
    elif _stage == "Showdown":
        group, is_mvp = i, np.nan
    else:
        group = np.nan
        is_mvp = person.find("img", attrs={"src": self._img_mvp}) is not None

    record["group"] = group
    record["is_mvp"] = is_mvp
    return record
def link_to_folder(link: bs4.element.Tag) -> str:
    """Return the single folder name a link points to, or ``""``.

    The link's ``href`` (empty string when missing) is accepted only when it
    is a relative reference, i.e. has neither a scheme nor a network
    location, and when its normalised path is one plain path component.
    Paths that still contain a ``/`` after normalisation, as well as ``.``
    and ``..``, are rejected.

    :param link: tag whose ``href`` attribute is inspected
    :return: the normalised single-component path, or ``""`` when rejected
    """
    href: str = link.get("href", default="")
    parsed: ParseResult = urlparse(href)

    points_elsewhere = bool(parsed.scheme or parsed.netloc)
    if points_elsewhere:
        return ""

    # normpath drops a trailing slash and collapses "./" and "x/../" parts;
    # an empty path normalises to ".".
    folder: str = posixpath.normpath(parsed.path)
    if "/" in folder or folder in (".", ".."):
        return ""
    return folder
def stylesheet_filter_func(tag: bs4.element.Tag) -> bool:
    """Tell whether ``tag`` is an enabled stylesheet reference.

    A tag matches when it has a ``rel`` attribute containing ``"stylesheet"``
    and does not carry a ``disabled`` attribute.

    :param tag: the tag to test
    :return: ``True`` for a matching tag, ``False`` otherwise
    """
    if not tag.has_attr("rel"):
        return False
    if "stylesheet" not in tag.get("rel"):
        return False
    # Any ``disabled`` attribute, whatever its value, switches the tag off.
    return not tag.has_attr("disabled")
def get_seclist_for_el(el: bs4.element.Tag, ref_map: Dict, default_seclist: List) -> List[Tuple]:
    """
    Build sec_list for tag

    Text nodes, and elements without a truthy ``s2orc_id`` attribute, get
    ``default_seclist`` back unchanged. Otherwise the section list is built
    by ``build_section_list`` from the element's ``s2orc_id``.

    :param el: element (or text node) to resolve a section list for
    :param ref_map: reference map forwarded to ``build_section_list``
    :param default_seclist: value returned when ``el`` has no section id
    :return: the section list for ``el``, or ``default_seclist``
    """
    # isinstance rather than an exact type comparison: subclasses of
    # NavigableString (comments, CDATA, ...) are text nodes too and have no
    # attribute mapping to query, so they must take the default path as well.
    if isinstance(el, NavigableString):
        return default_seclist

    sec_id = el.get('s2orc_id', None)
    if not sec_id:
        return default_seclist
    return build_section_list(sec_id, ref_map)
def extract_info_from_post(post: bs4.element.Tag) -> dict:
    """
    Collect the attributes of interest from one post element

    The link is the ``a`` tag with class ``result-title hdrlnk`` and the
    timestamp is the ``time`` tag with class ``result-date``; both are
    expected to exist inside ``post``.

    :type post: bs4.element.Tag
    :param post: a HTML Tag representing a single post
    :rtype: dict
    :return: dict with the keys ``href``, ``data_id``, ``posted_at`` and
        ``repost_of``; an attribute that is absent yields ``None``
    """
    title_link = post.find('a', class_='result-title hdrlnk')
    timestamp = post.find('time', class_='result-date')

    return {
        'href': title_link.get('href'),
        'data_id': title_link.get('data-id'),
        'posted_at': timestamp.get('datetime'),
        # Read from the post element itself, not from a child tag.
        'repost_of': post.get('data-repost-of'),
    }
def dblp_contrib_single(
        self, elem: bs4.element.Tag) -> fatcat_openapi_client.ReleaseContrib:
    """
    Convert one author element into a ``ReleaseContrib``.

    In the future, might try to implement creator key-ification and lookup
    here.

    Example rows:

        <author>Michael H. Böhlen</author>
        <author orcid="0000-0002-4354-9138">Nicolas Heist</author>
        <author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author>

    A trailing all-digit token in the name is dropped. When the element
    carries an ``orcid`` attribute that survives ``clean_orcid``, it is
    resolved through ``self.lookup_orcid``; if that yields no creator, the
    ORCID is kept in ``extra`` instead.
    """
    raw_name = clean_str(elem.text)

    # remove number in author name, if present
    if raw_name:
        name_parts = raw_name.split()
        if name_parts[-1].isdigit():
            raw_name = " ".join(name_parts[:-1])

    creator_id = None
    extra = None
    if elem.get("orcid"):
        attr_value = elem["orcid"]
        # The attribute may come back as a list; only its first item is used.
        first_value = attr_value[0] if isinstance(attr_value, list) else attr_value
        orcid = clean_orcid(first_value)
        if orcid:
            creator_id = self.lookup_orcid(orcid)
            if not creator_id:
                extra = {"orcid": orcid}

    return fatcat_openapi_client.ReleaseContrib(
        raw_name=raw_name,
        creator_id=creator_id,
        extra=extra,
    )
def _secref_id_from_raw_id(raw_id: str) -> str:
    """
    Translate a raw element id into its section reference id

    :param raw_id: value of an element's ``id`` attribute
    :return: ``raw_id`` with ``cid`` replaced by ``SECREF``, or otherwise
        with ``uid`` replaced by ``SECREFU``
    :raises NotImplementedError: if the id contains neither ``cid`` nor ``uid``
    """
    if 'cid' in raw_id:
        return raw_id.replace('cid', 'SECREF')
    if 'uid' in raw_id:
        return raw_id.replace('uid', 'SECREFU')
    print('Unknown ID type!', raw_id)
    raise NotImplementedError(f'Unknown ID type: {raw_id!r}')


def get_sections_from_div(el: bs4.element.Tag, sp: BeautifulSoup, parent: Optional[str], faux_max: int) -> Dict:
    """
    Process section headers for one div

    The div itself is recorded when it has an ``id`` (translated to a
    ``SECREF``/``SECREFU`` reference id) or when it is marked
    ``rend="nonumber"`` (it then gets the synthetic id ``SECREF{faux_max}``).
    Direct child divs are processed recursively; direct ``p`` / ``proof``
    children that have an ``id`` are recorded as sections of their own.
    Every recorded element also gets its reference id written to its
    ``s2orc_id`` attribute.

    :param el: the div to process; it and its children are modified in place
    :param sp: passed through to the recursive calls, not otherwise used here
    :param parent: reference id of the enclosing section, if any
    :param faux_max: number used for the synthetic id of an unnumbered div
    :return: dict mapping each reference id to a dict with the keys ``num``,
        ``text``, ``ref_id`` and ``parent``
    :raises NotImplementedError: if an ``id`` contains neither ``cid`` nor ``uid``
    """
    sec_map_dict = {}
    el_ref_id = None

    el_id = el.get('id', None)
    # process divs with ids
    if el_id:
        el_ref_id = _secref_id_from_raw_id(el_id)
        el['s2orc_id'] = el_ref_id
        sec_map_dict[el_ref_id] = {
            "num": el.get('id-text', None),
            "text": get_section_name(el),
            "ref_id": el_ref_id,
            "parent": parent
        }
    # process divs without section numbers
    elif el.get('rend') == "nonumber":
        el_ref_id = f'SECREF{faux_max}'
        el['s2orc_id'] = el_ref_id
        sec_map_dict[el_ref_id] = {
            "num": None,
            "text": get_section_name(el),
            "ref_id": el_ref_id,
            "parent": parent
        }

    # Children hang off this div when it was recorded, else off its parent.
    child_parent = el_ref_id if el_ref_id else parent

    # process sub elements
    for sub_el in el.find_all(recursive=False):
        if sub_el.name.startswith('div'):
            # Move faux_max past every purely numeric SECREF id recorded so
            # far. NOTE: str.strip removes a *set of characters* from both
            # ends, not a prefix; ids such as 'SECREFU3' leave 'U3' and are
            # skipped by the isdigit() filter.
            sec_keys = [
                int(k.strip('SECREF'))
                for k in sec_map_dict
                if k and k.strip('SECREF').isdigit()
            ]
            faux_max = max(sec_keys + [faux_max]) + 1
            sec_map_dict.update(
                get_sections_from_div(sub_el, sp, child_parent, faux_max))
        elif sub_el.name in ('p', 'proof'):
            sub_id = sub_el.get('id', None)
            if not sub_id:
                continue

            head_el = sub_el.head
            hi_el = sub_el.hi

            # Prefer the element's own number; fall back to the one on its
            # <hi> child, which may be absent.
            sec_num = sub_el.get('id-text', None)
            if sec_num is None and hi_el is not None:
                sec_num = hi_el.get('id-text', None)

            if head_el is not None:
                sec_text = head_el.text
            elif hi_el is not None:
                sec_text = hi_el.text
            else:
                sec_text = ""

            sub_el_ref_id = _secref_id_from_raw_id(sub_id)
            sub_el['s2orc_id'] = sub_el_ref_id
            # Keyed by the paragraph's own id (the value stored in its
            # s2orc_id) so it neither overwrites the enclosing div's entry
            # nor collides with sibling paragraphs.
            sec_map_dict[sub_el_ref_id] = {
                "num": sec_num,
                "text": sec_text,
                "ref_id": sub_el_ref_id,
                "parent": child_parent
            }

    return sec_map_dict
def _parse_link(el: bs4.element.Tag) -> Link:
    """Build a ``Link`` from an element's text and ``onclick`` attribute.

    ``href`` is always ``None``; ``onclick`` is ``None`` when the element has
    no such attribute.
    """
    handler = None
    if el.has_attr('onclick'):
        handler = el.get('onclick')
    return Link(text=el.text, href=None, onclick=handler)