def validate_and_extract(self, node: bs4.element):
    """Return an EPinterestPin for a Pinterest embed anchor, else None."""
    if node.name != 'a':
        return None
    if not (node.has_attr('data-pin-do') and node.has_attr('href')):
        return None
    if node['data-pin-do'] != 'embedPin':
        return None
    return EPinterestPin(node['href'])
def validate_and_extract(self, node: bs4.element):
    """Extract an image from an <img> tag, or from a <figure> that wraps
    one (passing along its <figcaption>); otherwise None."""
    if node.name == 'img' and node.has_attr('src'):
        return self.__create_image(node)
    if node.name == 'figure':
        inner_img = node.find('img')
        if inner_img is not None and inner_img.has_attr('src'):
            return self.__create_image(inner_img, node.find('figcaption'))
    return None
def get_normal_price(self, product_element: element) -> float:
    """Return the product's regular price, or None when no price node exists.

    Prefers the crossed-out "old price" node; falls back to the regular
    price node.
    """
    price_node = product_element.select_one(
        ".price-box .old-price span.price-old")
    if price_node is None:
        price_node = product_element.select_one(
            ".price-box .regular-price span.price")
    if price_node is None:
        return None
    return float(self.parse_price(price_node.text))
def validate_and_extract(self, node: bs4.element):
    """Return an ETweet for an embedded-tweet block, else None."""
    if not isinstance(node, bs4.element.Tag) or not node.has_attr('class'):
        return None
    classes = node['class']
    if 'twitter-tweet' not in classes \
            and 'twitter-tweet-rendered' not in classes:
        return None
    anchors = node.find_all('a')
    if anchors and anchors[-1].has_attr('href'):
        # The last anchor's URL ends in the tweet id; drop any query string.
        tweet_url = anchors[-1]['href']
        return ETweet(tweet_url.split('/')[-1].split('?')[0])
    return None
def __init__(self, card: box_list_element):
    """Populate card fields (name, effect text, pendulum text, DB URL)
    from a card list element."""
    self.name = card.find(class_='box_card_name').get_text().strip()
    self.text = card.find(
        class_='box_card_text').get_text(separator='\n').strip()
    pen_node = card.find(class_='box_card_pen_effect')
    # Not every card has a pendulum effect; default to empty string.
    self.pendulum_text = (
        pen_node.get_text(separator='\n').strip() if pen_node else '')
    link_value = card.find(class_='link_value').get('value')
    self.ygo_db_url = 'https://www.db.yugioh-card.com' + link_value
def procesaTablaBajas(tablaBajas: bs4.element, traduccionesConocidas: dict) -> dict:
    """Parse the departures ("bajas") table into per-person records.

    Returns a dict with up to two buckets, 'tecnicos' (staff) and
    'jugadores' (players), each mapping person id -> data dict with keys
    URL, id, activo, nombre (set of name variants), dorsal.

    :param tablaBajas: BS4 element containing the departures table.
    :param traduccionesConocidas: known id -> set-of-names translations,
        merged into each record's 'nombre' set (may be None).
    """
    auxTraducciones = traduccionesConocidas or dict()
    result = defaultdict(dict)
    for row in tablaBajas.find("tbody").find_all("tr"):
        tds = list(row.find_all("td"))
        data = dict()
        # Second cell links to the person's detail page; derive URL and id.
        link = tds[1].find("a").attrs['href']
        data['URL'] = MergeURL(URL_BASE, link)
        data['id'] = getObjID(link, 'ver')
        # Everyone listed in this table has left the squad.
        data['activo'] = False
        # Seed the name set with previously known translations for this id
        # (copied so the shared translation set is not mutated).
        data['nombre'] = set().union(auxTraducciones.get(data['id'], set()))
        data['dorsal'] = row.find("td", {"class": "dorsal"}).get_text().strip()
        # Every <span> inside the "jugador" cell is a name variant.
        nuevosNombres = {
            sp.get_text().strip()
            for sp in row.find("td", {
                "class": "jugador"
            }).find_all("span")
        }
        data['nombre'].update(nuevosNombres)
        # Position string from the third cell; an exact "ENT" entry marks
        # technical staff (coach), anything else is a player.
        posics = {tds[2].find("span").get_text().strip()}
        destClass = 'tecnicos' if "ENT" in posics else 'jugadores'
        result[destClass][data['id']] = data
    return result
def _parse_one_oakland_chart(chart: bs4_element):
    """Parse one of the charts on the Oakland page.

    :param chart: BS4 element for one chart.
    :return: (title, months, teus) — month numbers parsed from each bar's
        ``title`` attribute and TEU counts as floats (NaN when blank).
    """
    title_el = chart.find('div', attrs={'class': 'chart-vertical-title'})
    title = title_el.text
    # Only <li> elements carrying a title attribute are data bars.
    data_els = [el for el in chart.find_all('li') if 'title' in el.attrs]
    months = []
    teus = []
    for el in data_els:
        month = pd.to_datetime(el.attrs['title'], format='%b').month
        num_el = el.find('span', attrs={'class': 'number'})
        # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin float is the documented replacement.
        num = np.nan if len(num_el.text) == 0 \
            else float(num_el.text.replace(',', ''))
        months.append(month)
        teus.append(num)
    return title, months, teus
def validate_and_extract(self, node: bs4.element):
    """Return an EYouTubeVideo for a YouTube embed iframe, else None."""
    if not isinstance(node, bs4.element.Tag):
        return None
    if node.name != 'iframe' or not node.has_attr('src'):
        return None
    src = node['src']
    if not src.startswith('https://www.youtube.com/embed/'):
        return None
    return EYouTubeVideo(self.__get_youtube_video_id(src))
def validate_and_extract(self, node: bs4.element):
    """Return an EYouTubeVideo when node is a YouTube embed iframe, else None."""
    is_youtube_embed = (
        node.name == 'iframe'
        and node.has_attr('src')
        and utils.has_domain(node['src'],
                             r'^https://www\.youtube\.com/embed'))
    if is_youtube_embed:
        return EYouTubeVideo(self.__get_youtube_video_id(node['src']))
    return None
async def _get_images(self, data: element, title: str,
                      max_images: Optional[int] = None) -> list:
    """Collect image value objects from the <img> tags inside *data*.

    Skips images whose alt text is in self._BANNED_ALT, deduplicates by
    URL (query string stripped), resolves relative URLs against
    self._DOMAIN, and scans at most *max_images* tags when given.

    FIX: the return annotation previously said ``element``; the coroutine
    actually returns a list of image value objects.

    :param data: BS4 element to scan for <img> tags.
    :param title: pretty name attached to each downloaded image.
    :param max_images: optional cap on the number of <img> tags scanned.
    :return: list of image value objects.
    """
    images = []
    seen_urls = []
    img_tags = data.find_all('img')
    if max_images:
        img_tags = img_tags[:max_images]
    for img_tag in img_tags:
        if 'alt' in img_tag.attrs:
            if img_tag.attrs['alt'] in self._BANNED_ALT:
                continue
        # Strip the query string before deduplicating.
        img_url = img_tag.attrs['src'].split('?')[0]
        if img_url in seen_urls:
            continue
        seen_urls.append(img_url)
        # No netloc means a relative URL: prefix the site domain.
        if not urllib.parse.urlparse(img_url).netloc:
            img_url = urllib.parse.urljoin(self._DOMAIN, img_url)
        image = await self._get_file_value_object(
            url=img_url,
            pretty_name=title,
            filename_unique=self._FILENAME_UNIQUE,
            public_url=self._PUBLIC_URL)
        images.append(image)
    return images
def get_cells_from_row(row: bs4.element) -> list:
    """
    Get text from cells in the given BS4 table row.

    :param row: BS4 table row.
    :return: List of stripped cell text strings.

    FIX: this was declared ``-> list`` but implemented as a generator.
    Returning an actual list makes the annotation truthful; every caller
    only iterates the result, so the change is backward compatible.
    """
    return [column.text.strip() for column in row.find_all('td')]
def validate_and_extract(self, node: bs4.element):
    """Validates if a tag is instagram post tag and returns the extracted
    data from the tag in EInstagramPost object"""
    if not isinstance(node, bs4.element.Tag):
        return None
    classes = node['class'] if node.has_attr('class') else []
    if 'instagram-media' in classes or 'instagram-media-rendered' in classes:
        # Blockquote embed: the shortcode lives in the first anchor's href.
        return EInstagramPost(
            self.__get_instagram_shortcode(node.find('a')['href']))
    if (node.name == 'iframe' and node.has_attr('src')
            and node['src'].startswith('https://instagram.com/')):
        # Iframe embed: the shortcode is part of the src URL itself.
        return EInstagramPost(self.__get_instagram_shortcode(node['src']))
    return None
def validate_and_extract(self, node: bs4.element):
    """Validates if a tag is text tag and returns the extracted data from
    the text tag in Text object"""
    if isinstance(node, bs4.element.Tag):
        stripped = node.get_text().strip()
        if node.name not in TEXT_TAGS or utils.empty_text(stripped):
            return None
        # A short element wrapping <strong>/<b> counts as bold text.
        is_bold = ((node.find('strong') or node.find('b'))
                   and len(node.contents) <= MAX_CHILD)
        return Text(stripped, node.name, is_bold)
    if isinstance(node, bs4.element.NavigableString) \
            and not utils.empty_text(node):
        return Text(node.strip())
    return None
def validate_and_extract(self, node: bs4.element):
    """Return a Quote built from a non-empty <q> element, else None."""
    if node.name != 'q' or utils.empty_text(node.text):
        return None
    cite = node['cite'] if node.has_attr('cite') else None
    return Quote(node.text, cite)
def validate_and_extract(self, node: bs4.element):
    """Return a Video for a <video> tag (direct src or nested <source>
    children) or for an <embed> whose src matches a video extension;
    otherwise None."""
    sources = []
    if node.name == 'video':
        if node.has_attr('src'):
            sources.append(node['src'])
        elif node.contents:
            # No src on the tag itself: gather nested <source> children.
            sources.extend(
                child['src'] for child in node.contents
                if child.name == 'source' and child.has_attr('src'))
        if sources:
            height, width = e_utils.get_media_size(node)
            return Video(sources, height, width)
    if (node.name == 'embed' and node.has_attr('src')
            and VIDEO_EXTENSIONS_PATTERN.match(node['src'])):
        sources.append(node['src'])
        height, width = e_utils.get_media_size(node)
        return Video(sources, height, width)
    return None
def json_to_html(cls, bsObj: element, json_obj, name=None):
    """Recursively render a JSON value as nested HTML tags.

    dicts and lists become <div> containers (class = *name* or
    'json_list'); strings become <span> leaves whose text is passed
    through cls._handle_str.

    FIX: replaced ``not name == None`` / ``name != None`` with identity
    checks per PEP 8 (`is not None`), and iterate dict items directly.
    """
    div = bsObj.new_tag('div')
    div.attrs['class'] = name if name is not None else 'json_list'
    if isinstance(json_obj, dict):
        for child_name, child_value in json_obj.items():
            div.append(cls.json_to_html(bsObj, child_value, child_name))
    elif isinstance(json_obj, list):
        for child in json_obj:
            div.append(cls.json_to_html(bsObj, child))
    elif isinstance(json_obj, str):
        # String leaf: demote the container to an inline <span>.
        div.name = 'span'
        div.attrs['class'] = name if name is not None else 'str'
        div.append(cls._handle_str(json_obj))
    return div
def json_to_html(cls, bsObj: element, json_obj, name=None):
    """Recursively render a JSON value as nested HTML tags.

    dicts and lists become <div> containers (class = *name* or
    'json_list'); strings become <span> leaves whose text is passed
    through cls._handle_str.

    FIX: replaced ``not name == None`` / ``name != None`` with identity
    checks per PEP 8 (`is not None`), and iterate dict items directly.
    """
    div = bsObj.new_tag('div')
    div.attrs['class'] = name if name is not None else 'json_list'
    if isinstance(json_obj, dict):
        for child_name, child_value in json_obj.items():
            div.append(cls.json_to_html(bsObj, child_value, child_name))
    elif isinstance(json_obj, list):
        for child in json_obj:
            div.append(cls.json_to_html(bsObj, child))
    elif isinstance(json_obj, str):
        # String leaf: demote the container to an inline <span>.
        div.name = 'span'
        div.attrs['class'] = name if name is not None else 'str'
        div.append(cls._handle_str(json_obj))
    return div
def get_entries_from_table(table: bs4.element) -> list:
    """
    Take a BS4 representation of a tournament entries table and return a
    list of lists, each internal list holding the last names of the team's
    members.

    :param table: Table of entries.
    :return: List of name lists.
    """
    for row in table.find_all('tr'):
        # Rows with fewer than three columns carry no partnership info.
        if len(row.find_all('td')) < 3:
            continue
        columns = list(get_cells_from_row(row))
        # Unannounced entries are skipped.
        if columns[2] == 'Names TBA':
            continue
        school = columns[0]
        names = columns[2].replace('&', '').split()
        yield Partnership(school, tuple(names))
def __parse_row(self, row: element) -> None:
    """Fill this object's fields from one table row of operations data."""
    cells = row.find_all('td')
    self.code = cells[0].text.strip()
    # The period cell holds either "<buy> a <sell>" (a date range) or a
    # single buy date.
    period = cells[1].span.text.strip()
    if ' a ' in period:
        buy, sell = period.split(' a ')
        self.buy_date = datetime.strptime(buy, "%d/%m/%Y").date()
        self.sell_date = datetime.strptime(sell, "%d/%m/%Y").date()
    else:
        self.buy_date = datetime.strptime(period, "%d/%m/%Y").date()
        self.sell_date = None
    self.buy_amount = int(cells[2].text.strip())
    self.sell_amount = int(cells[3].text.strip())
    # Prices use a comma as the decimal separator.
    self.buy_price = float(cells[4].text.strip().replace(',', '.'))
    self.sell_price = float(cells[5].text.strip().replace(',', '.'))
    # NOTE(review): column 6 is intentionally unread here — confirm it
    # carries no needed data.
    self.position = cells[7].text.strip()
    self.__initialized = True
def get_image(self, product_element: element) -> str:
    """Return the src URL of the product's first <img> tag."""
    img_tag = product_element.find("img")
    return img_tag['src']
def get_offer_price(self, product_element: element) -> float:
    """Return the special (offer) price, or None when no offer is shown."""
    price_node = product_element.select_one(
        ".price-box p.special-price span.price")
    if price_node is None:
        return None
    return float(self.parse_price(price_node.text))
def get_status(self, product_element: element) -> bool:
    """Return True when the product card shows an add-to-cart button,
    i.e. the product is purchasable.

    FIX: the annotation said ``-> int`` but the function returns the
    result of an ``is not None`` test; ``bool`` (a subclass of ``int``,
    so backward compatible) is the truthful return type.
    """
    button = product_element.select_one(
        "div.salesperson-category-products li.first div.actions div.add-to-cart button.btn-cart"
    )
    return button is not None
def get_url(self, product_element: element) -> str:
    """Return the href of the product's first anchor tag."""
    anchor = product_element.find("a")
    return anchor['href']
async def _remove_non_text_tags(data: element) -> element:
    """Strip every <script> tag from *data* in place and return it."""
    for script_tag in data.find_all('script'):
        script_tag.decompose()
    return data