Exemple #1
0
    def validate_and_extract(self, node: bs4.element):
        """Return an ``EPinterestPin`` for an embedded-pin anchor, else ``None``.

        A node qualifies when its name is ``a``, its ``data-pin-do``
        attribute equals ``'embedPin'`` and it carries an ``href``; the pin
        is built from that ``href`` value.
        """
        if node.name != 'a':
            return None
        if not node.has_attr('data-pin-do'):
            return None
        if node['data-pin-do'] != 'embedPin':
            return None
        if not node.has_attr('href'):
            return None

        return EPinterestPin(node['href'])
Exemple #2
0
    def validate_and_extract(self, node: bs4.element):
        """Build an image from an ``<img>`` or a ``<figure>`` wrapping one.

        An ``img`` node with a ``src`` is handed to ``self.__create_image``
        directly.  For a ``figure`` node, its first ``img`` descendant (if it
        has a ``src``) is used together with the result of looking up
        ``figcaption`` inside the figure.  Anything else yields ``None``.
        """
        if node.name == 'img' and node.has_attr('src'):
            return self.__create_image(node)

        if node.name != 'figure':
            return None

        inner_img = node.find('img')
        if not inner_img or not inner_img.has_attr('src'):
            return None

        return self.__create_image(inner_img, node.find('figcaption'))
    def get_normal_price(self, product_element: element) -> float:
        """Return the product's normal price as a float, or ``None``.

        The ``.old-price`` node is tried first and the ``.regular-price``
        node second; the text of the first match is run through
        ``self.parse_price`` and converted with ``float``.  ``None`` is
        returned when neither selector matches.
        """
        selectors = (
            ".price-box .old-price span.price-old",
            ".price-box .regular-price span.price",
        )

        price_node = None
        for selector in selectors:
            price_node = product_element.select_one(selector)
            if price_node is not None:
                break

        if not price_node:
            return None

        return float(self.parse_price(price_node.text))
Exemple #4
0
    def validate_and_extract(self, node: bs4.element):
        """Return an ``ETweet`` for an embedded-tweet tag, else ``None``.

        The node must be a ``Tag`` whose ``class`` contains ``twitter-tweet``
        or ``twitter-tweet-rendered``.  The tweet id is taken from the
        ``href`` of the last ``<a>`` inside the node.
        """
        if not isinstance(node, bs4.element.Tag) or not node.has_attr('class'):
            return None

        css_classes = node['class']
        if 'twitter-tweet' not in css_classes \
                and 'twitter-tweet-rendered' not in css_classes:
            return None

        anchors = node.find_all('a')
        if not anchors:
            return None

        last_anchor = anchors[-1]
        if not last_anchor.has_attr('href'):
            return None

        # The id is the final path segment of the link, minus any query string.
        last_segment = last_anchor['href'].rsplit('/', 1)[-1]
        return ETweet(last_segment.partition('?')[0])
Exemple #5
0
    def __init__(self, card: box_list_element):
        """Populate the card fields from one card element.

        Reads the elements with classes ``box_card_name``, ``box_card_text``
        and (optionally) ``box_card_pen_effect``, and builds the database URL
        from the ``value`` attribute of the ``link_value`` element.
        ``pendulum_text`` is an empty string when the card has no
        ``box_card_pen_effect`` element.
        """
        name_node = card.find(class_='box_card_name')
        self.name = name_node.get_text().strip()

        text_node = card.find(class_='box_card_text')
        self.text = text_node.get_text(separator='\n').strip()

        pendulum_node = card.find(class_='box_card_pen_effect')
        self.pendulum_text = (
            pendulum_node.get_text(separator='\n').strip()
            if pendulum_node else '')

        link_value = card.find(class_='link_value').get('value')
        self.ygo_db_url = 'https://www.db.yugioh-card.com' + link_value
def procesaTablaBajas(tablaBajas: bs4.element,
                      traduccionesConocidas: dict) -> dict:
    """Turn the rows of a table into per-id records, grouped by role.

    Every ``<tr>`` under the table's ``<tbody>`` becomes a dict with the keys
    ``URL``, ``id``, ``activo`` (always ``False``), ``nombre`` (a set of
    names) and ``dorsal``.  A record goes under ``'tecnicos'`` when the text
    of the span in the third cell is ``"ENT"`` and under ``'jugadores'``
    otherwise.

    :param tablaBajas: element containing a ``<tbody>`` with the rows.
    :param traduccionesConocidas: mapping from id to already-known names;
        a falsy value is treated as an empty mapping.
    :return: ``defaultdict`` mapping group name to ``{id: record}``.
    """
    known_names = traduccionesConocidas or dict()
    grouped = defaultdict(dict)

    body_rows = tablaBajas.find("tbody").find_all("tr")
    for row in body_rows:
        cells = row.find_all("td")

        # The link in the second cell provides both the URL and the id.
        href = cells[1].find("a").attrs['href']
        url = MergeURL(URL_BASE, href)
        obj_id = getObjID(href, 'ver')

        # Start from a copy of the known names so the input is not mutated.
        names = set().union(known_names.get(obj_id, set()))
        dorsal = row.find("td", {"class": "dorsal"}).get_text().strip()
        player_cell = row.find("td", {"class": "jugador"})
        names.update(
            span.get_text().strip() for span in player_cell.find_all("span"))

        position = cells[2].find("span").get_text().strip()
        group = 'tecnicos' if position == "ENT" else 'jugadores'

        grouped[group][obj_id] = {
            'URL': url,
            'id': obj_id,
            'activo': False,
            'nombre': names,
            'dorsal': dorsal,
        }

    return grouped
Exemple #7
0
def _parse_one_oakland_chart(chart: bs4_element):
    """Parse one of the charts on the Oakland page.

    :param chart: chart element holding a ``div.chart-vertical-title`` and
        ``<li>`` entries; only ``<li>`` elements with a ``title`` attribute
        are treated as data points.
    :return: tuple ``(title, months, teus)`` where ``title`` is the text of
        the title div, ``months`` is a list of month numbers parsed from each
        ``title`` attribute with the ``%b`` format, and ``teus`` is a list of
        floats (``np.nan`` where the ``span.number`` text is empty).
    """
    title_el = chart.find('div', attrs={'class': 'chart-vertical-title'})
    title = title_el.text

    months = []
    teus = []
    for el in chart.find_all('li'):
        if 'title' not in el.attrs:
            continue

        month = pd.to_datetime(el.attrs['title'], format='%b').month
        num_text = el.find('span', attrs={'class': 'number'}).text
        # Builtin float instead of ``np.float``: the alias was removed in
        # NumPy 1.24 and was always identical to the builtin.
        # Thousands separators are stripped before conversion.
        num = float(num_text.replace(',', '')) if num_text else np.nan

        months.append(month)
        teus.append(num)

    return title, months, teus
    def validate_and_extract(self, node: bs4.element):
        """Return an ``EYouTubeVideo`` for a YouTube embed iframe, else ``None``.

        The node must be a ``Tag`` named ``iframe`` whose ``src`` starts with
        ``https://www.youtube.com/embed/``; the video id is derived from that
        ``src`` by ``self.__get_youtube_video_id``.
        """
        if not isinstance(node, bs4.element.Tag):
            return None
        if node.name != 'iframe' or not node.has_attr('src'):
            return None

        source = node['src']
        if not source.startswith('https://www.youtube.com/embed/'):
            return None

        return EYouTubeVideo(self.__get_youtube_video_id(source))
    def validate_and_extract(self, node: bs4.element):
        """Return an ``EYouTubeVideo`` for a YouTube embed iframe, else ``None``.

        The node must be named ``iframe`` and have a ``src`` that
        ``utils.has_domain`` accepts for the YouTube embed pattern; the video
        id is derived from that ``src`` by ``self.__get_youtube_video_id``.
        """
        if node.name != 'iframe' or not node.has_attr('src'):
            return None

        source = node['src']
        if not utils.has_domain(source, r'^https://www\.youtube\.com/embed'):
            return None

        return EYouTubeVideo(self.__get_youtube_video_id(source))
Exemple #10
0
 async def _get_images(self,
                       data: element,
                       title: str,
                       max_images: Optional[int] = None) -> list:
     """Fetch the images referenced by ``<img>`` tags inside ``data``.

     :param data: element searched for ``<img>`` tags.
     :param title: passed as ``pretty_name`` for every fetched image.
     :param max_images: when truthy, only the first ``max_images`` tags are
         considered (the cut is applied before any filtering).
     :return: list of the objects returned by
         ``self._get_file_value_object``, one per distinct image URL.

     Tags whose ``alt`` is in ``self._BANNED_ALT`` are skipped, as are tags
     without a usable ``src`` and URLs already seen (compared after the
     query string is removed).  URLs without a network location are
     resolved against ``self._DOMAIN``.
     """
     img_tags = data.find_all('img')
     if max_images:
         img_tags = img_tags[:max_images]

     images = []
     seen_urls = set()
     for img_tag in img_tags:
         attrs = img_tag.attrs
         if 'alt' in attrs and attrs['alt'] in self._BANNED_ALT:
             continue

         # An <img> with no (or empty) src has nothing to download; skip it
         # instead of raising KeyError and losing the remaining images.
         src = attrs.get('src')
         if not src:
             continue

         img_url = src.split('?')[0]
         if img_url in seen_urls:
             continue
         seen_urls.add(img_url)

         if not urllib.parse.urlparse(img_url).netloc:
             img_url = urllib.parse.urljoin(self._DOMAIN, img_url)

         image = await self._get_file_value_object(
             url=img_url,
             pretty_name=title,
             filename_unique=self._FILENAME_UNIQUE,
             public_url=self._PUBLIC_URL)
         images.append(image)

     return images
Exemple #11
0
def get_cells_from_row(row: bs4.element) -> list:
    """
    Yield the stripped text of every ``td`` cell in the given BS4 table row.

    Note that this is a generator: wrap the call in ``list(...)`` when an
    actual list is needed.
    :param row: BS4 table row.
    """
    yield from (cell.text.strip() for cell in row.find_all('td'))
Exemple #12
0
    def validate_and_extract(self, node: bs4.element):
        """Return an ``EInstagramPost`` for an Instagram embed, else ``None``.

        Two shapes are recognised, both only for ``Tag`` nodes: a tag whose
        ``class`` contains ``instagram-media`` or
        ``instagram-media-rendered`` (shortcode taken from the ``href`` of
        its first ``<a>``), and an ``iframe`` whose ``src`` starts with
        ``https://instagram.com/`` (shortcode taken from that ``src``).
        """
        if not isinstance(node, bs4.element.Tag):
            return None

        if node.has_attr('class'):
            css_classes = node['class']
            if 'instagram-media' in css_classes \
                    or 'instagram-media-rendered' in css_classes:
                post_link = node.find('a')['href']
                return EInstagramPost(
                    self.__get_instagram_shortcode(post_link))

        is_iframe = node.name == 'iframe'
        if is_iframe and node.has_attr('src') \
                and node['src'].startswith('https://instagram.com/'):
            return EInstagramPost(
                self.__get_instagram_shortcode(node['src']))

        return None
Exemple #13
0
    def validate_and_extract(self, node: bs4.element):
        """Return a ``Text`` object for a text tag or bare string, else ``None``.

        For a ``Tag`` whose name is in ``TEXT_TAGS`` and whose stripped text
        is not empty, the ``Text`` carries the text, the tag name and a bold
        marker (a ``strong``/``b`` descendant exists and the tag has at most
        ``MAX_CHILD`` direct contents).  For a non-empty ``NavigableString``
        the ``Text`` carries only the stripped string.
        """
        if isinstance(node, bs4.element.Tag):
            stripped = node.get_text().strip()
            if node.name not in TEXT_TAGS or utils.empty_text(stripped):
                return None

            bold_child = node.find('strong') or node.find('b')
            is_bold = bold_child and len(node.contents) <= MAX_CHILD
            return Text(stripped, node.name, is_bold)

        if isinstance(node, bs4.element.NavigableString) \
                and not utils.empty_text(node):
            return Text(node.strip())

        return None
Exemple #14
0
    def validate_and_extract(self, node: bs4.element):
        """Return a ``Quote`` for a non-empty ``<q>`` node, else ``None``.

        The quote is built from the node text and its ``cite`` attribute
        (``None`` when the attribute is absent).
        """
        if node.name != 'q' or utils.empty_text(node.text):
            return None

        cite = node['cite'] if node.has_attr('cite') else None
        return Quote(node.text, cite)
Exemple #15
0
    def validate_and_extract(self, node: bs4.element):
        """Return a ``Video`` for a ``<video>`` or video ``<embed>``, else ``None``.

        For ``video`` nodes the URL list is the node's own ``src`` when
        present, otherwise the ``src`` of each direct ``source`` child; a
        ``video`` with no URLs yields ``None``.  An ``embed`` node qualifies
        when its ``src`` matches ``VIDEO_EXTENSIONS_PATTERN``.  Height and
        width come from ``e_utils.get_media_size``.
        """
        if node.name == 'video':
            if node.has_attr('src'):
                video_urls = [node['src']]
            else:
                video_urls = [
                    child['src'] for child in node.contents
                    if child.name == 'source' and child.has_attr('src')
                ]

            if not video_urls:
                return None

            height, width = e_utils.get_media_size(node)
            return Video(video_urls, height, width)

        if node.name == 'embed' \
                and node.has_attr('src') \
                and VIDEO_EXTENSIONS_PATTERN.match(node['src']):
            embed_urls = [node['src']]
            height, width = e_utils.get_media_size(node)
            return Video(embed_urls, height, width)

        return None
    def json_to_html(cls, bsObj: element, json_obj, name=None):
        """Recursively render a JSON-like value as nested tags.

        :param bsObj: object whose ``new_tag`` creates the tags.
        :param json_obj: value to render.  A dict yields one child per key
            (the key becomes the child's class), a list yields one child per
            item, and a string yields a ``span`` containing
            ``cls._handle_str(json_obj)``.  Any other value produces an
            empty ``div``.
        :param name: class for the created tag; when ``None`` the class is
            ``'str'`` for strings and ``'json_list'`` otherwise.
        :return: the created tag.
        """
        div = bsObj.new_tag('div')
        # Identity comparison is the correct test for the None default.
        div.attrs['class'] = name if name is not None else 'json_list'

        if isinstance(json_obj, dict):
            for child_name, child in json_obj.items():
                div.append(cls.json_to_html(bsObj, child, child_name))
        elif isinstance(json_obj, list):
            for child in json_obj:
                div.append(cls.json_to_html(bsObj, child))
        elif isinstance(json_obj, str):
            div.name = 'span'
            div.attrs['class'] = name if name is not None else 'str'
            div.append(cls._handle_str(json_obj))
        return div
    def json_to_html(cls, bsObj: element, json_obj, name=None):
        """Recursively render a JSON-like value as nested tags.

        :param bsObj: object whose ``new_tag`` creates the tags.
        :param json_obj: value to render.  A dict yields one child per key
            (the key becomes the child's class), a list yields one child per
            item, and a string yields a ``span`` containing
            ``cls._handle_str(json_obj)``.  Any other value produces an
            empty ``div``.
        :param name: class for the created tag; when ``None`` the class is
            ``'str'`` for strings and ``'json_list'`` otherwise.
        :return: the created tag.
        """
        has_name = name is not None  # identity check, not ``== None``

        div = bsObj.new_tag('div')
        div.attrs['class'] = name if has_name else 'json_list'

        if isinstance(json_obj, dict):
            for child_name, child in json_obj.items():
                div.append(
                    cls.json_to_html(bsObj, child, child_name))
        elif isinstance(json_obj, list):
            for child in json_obj:
                div.append(cls.json_to_html(bsObj, child))
        elif isinstance(json_obj, str):
            div.name = 'span'
            div.attrs['class'] = name if has_name else 'str'
            div.append(cls._handle_str(json_obj))
        return div
Exemple #18
0
def get_entries_from_table(table: bs4.element) -> list:
    """
    Walk a BS4 tournament entries table and yield one ``Partnership`` per
    usable row.

    Rows with fewer than three ``td`` cells are ignored, as are rows whose
    third cell reads ``Names TBA``.  For the rest, the first cell is the
    school and the third cell (with ``&`` removed, split on whitespace)
    provides the tuple of names.
    :param table: Table of entries.
    :return: Generator of ``Partnership(school, names)`` objects.
    """
    for row in table.find_all('tr'):
        cell_count = len(row.find_all('td'))
        if cell_count >= 3:
            cells = list(get_cells_from_row(row))
            names_cell = cells[2]

            if names_cell != 'Names TBA':
                member_names = tuple(names_cell.replace('&', '').split())
                yield Partnership(cells[0], member_names)
    def __parse_row(self, row: element) -> None:
        """Fill this object's fields from the ``td`` cells of one table row.

        Cell 0 is the code; cell 1's span holds either a single date or
        ``"<buy> a <sell>"`` (dates in ``%d/%m/%Y``); cells 2-3 are integer
        amounts; cells 4-5 are prices written with a decimal comma; cell 7
        is the position.  ``sell_date`` is ``None`` when only one date is
        given.  Marks the object as initialized once every field is set.
        """
        cells = row.find_all('td')

        def cell_text(index):
            return cells[index].text.strip()

        def to_date(raw):
            return datetime.strptime(raw, "%d/%m/%Y").date()

        def to_price(raw):
            return float(raw.replace(',', '.'))

        self.code = cell_text(0)

        period = cells[1].span.text.strip()
        if ' a ' in period:
            buy, sell = period.split(' a ')
        else:
            buy, sell = period, None

        self.buy_date = to_date(buy)
        self.sell_date = to_date(sell) if sell is not None else None

        self.buy_amount = int(cell_text(2))
        self.sell_amount = int(cell_text(3))
        self.buy_price = to_price(cell_text(4))
        self.sell_price = to_price(cell_text(5))

        self.position = cell_text(7)
        self.__initialized = True
 def get_image(self, product_element: element) -> str:
     """Return the ``src`` of the first ``img`` found in the product element."""
     first_img = product_element.find("img")
     return first_img['src']
 def get_offer_price(self, product_element: element) -> float:
     """Return the special (offer) price as a float, or ``None`` if absent.

     The text of the ``.special-price`` node is run through
     ``self.parse_price`` before conversion.
     """
     offer_node = product_element.select_one(
         ".price-box p.special-price span.price")
     if not offer_node:
         return None
     return float(self.parse_price(offer_node.text))
 def get_status(self, product_element: element) -> int:
     """Report whether the product element contains an add-to-cart button.

     Returns ``True`` when the button selector matches, ``False`` otherwise.
     """
     selector = "div.salesperson-category-products li.first div.actions div.add-to-cart button.btn-cart"
     if product_element.select_one(selector) is None:
         return False
     return True
 def get_url(self, product_element: element) -> str:
     """Return the ``href`` of the first ``a`` found in the product element."""
     first_link = product_element.find("a")
     return first_link['href']
Exemple #24
0
 async def _remove_non_text_tags(data: element) -> element:
     """Decompose every ``script`` tag found in ``data`` and return ``data``."""
     script_tags = data.find_all('script')
     for tag in script_tags:
         tag.decompose()
     return data