def parse_row(self, obj: Element) -> dict:
    """Parse one result row into a dict of item information.

    Selectors that feed two fields are queried once and the result reused,
    and the comment search is run a single time.

    :param obj: element holding a single result row
    :return: dict with the keys ``name``, ``media_type``, ``category``,
        ``seeders``, ``leachers``, ``magnet_link``, ``file_size``,
        ``vip_status``, ``trusted``, ``href``, ``uploader`` and
        ``comment_count``
    """
    import re  # local import: only needed for the uploader split below

    name = get_first_element(obj, self.ITEM_NAME).text
    # One cell carries both links: index 0 is the media type, 1 the category.
    type_links = get_first_element(obj, self.MEDIA_AND_CATEGORY_TYPE).find("a")
    # One selector matches both counters: index 0 seeders, index 1 leechers.
    peer_cells = obj.find(self.SEEDERS_AND_LEACHERS)
    magnet_link = get_first_element(obj, self.ITEM_MAGNET_LINK).links.pop()
    # The size is the second comma-separated field of the description text.
    file_size = get_first_element(obj, "font.detDesc").text.split(",")[1]
    vip_status = get_first_element(obj, self.UPLOADER_VIP)
    trusted = get_first_element(obj, self.UPLOADER_TRUSTED)
    href = get_first_element(obj, "a.detLink").links.pop()

    # Split only on the first stand-alone word "by". Splitting on every
    # "by" substring and keeping the last piece returned "" for uploader
    # names that end in "by" and truncated names that merely contain it.
    # NOTE(review): assumes the uploader follows the first "by" word in the
    # description text - confirm against the page markup.
    description = obj.find("font.detDesc")[0].text
    uploader = re.split(r"\bby\b", description, maxsplit=1)[-1].strip()

    comments = obj.search(self.ITEM_COMMENTS)
    comment_count = comments.fixed[0] if comments else 0

    return {
        "name": name,
        "media_type": type_links[0].text,
        "category": type_links[1].text,
        "seeders": peer_cells[0].text,
        "leachers": peer_cells[1].text,
        "magnet_link": magnet_link,
        "file_size": file_size,
        "vip_status": vip_status,
        "trusted": trusted,
        "href": href,
        "uploader": uploader,
        "comment_count": comment_count,
    }
def title_parse(element: Element) -> Optional[str]:
    """Return the product title text, or ``None`` when no title node exists.

    The ``.product-title`` node is tried first and ``#name`` is the fallback.

    :param element: element to search
    :return: text of the first matching title node, else ``None``
    """
    for selector in (".product-title", "#name"):
        node = element.find(selector, first=True)
        if node:
            return node.text
    return None
def price_discount_parse(element: Element) -> Optional[float]:
    """Return the displayed price, preferring the discounted one.

    ``.discount-green`` is tried first and ``.price`` second. Each selector
    is queried once; the previous version repeated the same lookup up to
    three times behind a duplicated condition.

    :param element: element to search
    :return: the price as a float, or ``None`` when neither node exists
    :raises ValueError: if the node text after its first character is not
        a valid float
    """
    for selector in (".discount-green", ".price"):
        node = element.find(selector, first=True)
        if node:
            # The first character of the text is skipped before converting
            # (presumably a currency symbol - confirm against the markup).
            return float(node.text[1:])
    return None
def save_x_percent_in_cart_parse(element: Element) -> Optional[int]:
    """Return the "Save X% in Cart" percentage, or ``None`` if not shown.

    :param element: element to search for a ``title`` node containing
        "in Cart"
    :return: the integer found between "Save " and "% in Cart", as
        extracted by ``parse_html_text_btw``; ``None`` without such a node
    """
    cart_badge = element.find("title", containing="in Cart", first=True)
    if not cart_badge:
        return None
    percent_text = parse_html_text_btw(cart_badge.text, "Save ", "% in Cart")
    return int(percent_text)
def loyalty_credit_x_percent_parse(element: Element) -> Optional[int]:
    """Return the loyalty credit percentage, or ``None`` if unavailable.

    The number is the text in front of the literal "% Loyalty Credit".

    :param element: element to search for a ``.slanted-container`` node
        containing "Loyalty Credit"
    :return: the percentage as an int; ``None`` when the node is missing or
        its text lacks the exact "% Loyalty Credit" marker
    :raises ValueError: if the text before the marker is not an integer
    """
    badge = element.find(".slanted-container",
                         containing="Loyalty Credit",
                         first=True)
    if not badge:
        return None
    # partition() reports a missing marker explicitly; slicing with the -1
    # that str.find() returns would silently cut off the last character.
    percent, marker, _ = badge.text.partition("% Loyalty Credit")
    if not marker:
        return None
    return int(percent)
def product_code_parse(element: Element) -> Optional[str]:
    """Return the product code listed in the element, if any.

    The code is everything after the first ": " in the text of the ``li``
    node containing "Product Code".

    :param element: element to search
    :return: the product code; ``None`` when the node is missing or its
        text has no ": " separator
    """
    item = element.find("li", containing="Product Code", first=True)
    if not item:
        return None
    # partition() makes a missing separator explicit; str.find() + 2 would
    # evaluate to 1 in that case and silently drop the first character.
    _, separator, code = item.text.partition(": ")
    return code if separator else None
def parse_flat(self, html: Element) -> None:  # noqa: CCR001
    """Parse one flat listing and store it if it is not known yet.

    Reads the flat id, location labels, price and price per square metre
    from ``html``, derives the area as ``price / ppm`` rounded to two
    decimals, and creates a ``Flat`` unless one with the same id already
    exists. Any exception is printed and followed by ``rollback()``;
    nothing is re-raised.

    :param html: element containing a single flat listing
    """
    try:
        flat_url = html.find("a", first=True).attrs.get("href")
        # The numeric id is the digit run after "flat/" in the link.
        flat_id = int(re.search(r"flat/(\d+)", flat_url).group(1))

        location = html.xpath(".//a[@data-name='GeoLabel']/text()")
        # For the "ekb" domain the first geo label is skipped.
        if self.domain == "ekb":
            location = location[1:]
        # The first two labels are city and district; the remaining ones
        # are joined into a single location string.
        city, district, *location = location
        location = " ".join(location)

        # Both amounts are reduced to plain digits before conversion.
        price = html.xpath(".//span[@data-mark='MainPrice']/text()", first=True)
        price = int(price.replace("₽", "").strip().replace(" ", ""))
        ppm = html.xpath(".//p[@data-mark='PriceInfo']/text()", first=True)
        ppm = int(ppm.replace("₽/м²", "").strip().replace(" ", ""))
        square = round(price / ppm, 2)

        if not Flat.exists(id=flat_id):
            Flat(
                id=flat_id,
                city=city,
                district=district,
                location=location,
                price=price,
                ppm=ppm,
                square=square,
            )
            # NOTE(review): the single-line source does not show whether
            # commit() belongs inside this `if` - confirm.
            commit()
    except Exception as exc:
        # Best effort: report the failure and discard pending changes.
        print(exc)
        rollback()
def free_shipping_over_x_dollars_parse(element: Element) -> Optional[int]:
    """Return the order amount above which shipping is free, if advertised.

    Only a ``.banner-alert`` node whose text contains
    "Free Shipping\\xa0for orders over" is considered. The amount is read
    from the text starting two characters after the first newline position.

    :param element: element to search
    :return: the threshold as an int, or ``None`` when there is no banner
        or it does not advertise free shipping
    :raises ValueError: if the extracted text is not an integer
    """
    banner = element.find(".banner-alert", first=True)
    if not banner:
        return None
    if "Free Shipping\xa0for orders over" not in banner.text:
        return None
    amount_start = banner.text.find("\n") + 2
    return int(banner.text[amount_start:])
def package_qty_parse(element: Element) -> Optional[str]:
    """Return the package quantity listed in the element, if any.

    The quantity is everything after the first ": " in the text of the
    ``li`` node containing "Package Quantity".

    :param element: element to search
    :return: the quantity text; ``None`` when the node is missing or its
        text has no ": " separator
    """
    item = element.find("li", containing="Package Quantity", first=True)
    if not item:
        return None
    # partition() makes a missing separator explicit; str.find() + 2 would
    # evaluate to 1 in that case and silently drop the first character.
    _, separator, quantity = item.text.partition(": ")
    return quantity if separator else None
def find_element_by_tag_name(element: Element, tag_name):
    """Return the first descendant matching ``tag_name`` while holding ``Chrome.lock``.

    :param element: element to search
    :param tag_name: selector passed to ``element.find``
    :return: whatever ``element.find(tag_name, first=True)`` returns
    """
    # Acquire before entering the try block: if acquire() itself fails, the
    # finally clause must not release a lock that was never taken.
    Chrome.lock.acquire()
    try:
        return element.find(tag_name, first=True)
    finally:
        Chrome.lock.release()
def expiration_date_parse(element: Element) -> Optional[datetime.datetime]:
    """Return the expiration date listed in the element, if any.

    The date text is taken from the ``li`` node containing
    "Expiration Date" via ``parse_html_text_btw`` and parsed with the
    ``"%B %Y"`` format (full month name and year).

    :param element: element to search
    :return: the parsed datetime, or ``None`` when the node is missing
    :raises ValueError: if the extracted text does not match ``"%B %Y"``
    """
    item = element.find("li", containing="Expiration Date", first=True)
    if not item:
        return None
    month_and_year = parse_html_text_btw(item.text, "\n?\n", "\n")
    return datetime.datetime.strptime(month_and_year, "%B %Y")
def get_ride_info(ride: Element) -> dict:
    """Extract price, times, stations and free seats from one ride element.

    :param ride: element describing a single ride
    :return: dict with the ride data; ``free_seats`` is ``None`` when the
        ride has no seats notice or the notice does not start with a number
    :raises IndexError: if a required node (price, ride times, station
        names) is missing
    """
    # The price text is split at the non-breaking space; the first part is
    # the number.
    price_text = ride.find("span.num.currency-small-cents")[0].text
    price = float(price_text.split("\xa0")[0])

    # The ride-times text holds departure first and arrival second.
    ride_times = ride.find("div.ride-times")[0].text.split()
    departure_time = ride_times[0]
    arrival_time = ride_times[1]

    source = ride.find("div.departure-station-name")[0].text
    destination = ride.find("div.arrival-station-name")[0].text
    # TODO: duration and full departure/arrival datetimes are not extracted.

    # Default up front so the name is always bound; previously one branch
    # left it unassigned and the return statement raised UnboundLocalError.
    seats_available = None
    seat_notices = ride.find("div.seats-notice")
    if seat_notices:
        matcher = re.match(r"(\d+)\s+\w+", seat_notices[0].text)
        if matcher:
            seats_available = int(matcher.group(1))

    return {
        "departure_datetime": departure_time,
        "arrival_datetime": arrival_time,  # "2018-06-20 15:00:00",
        "source": source,
        "destinations": destination,
        "price": price,  # in EUR - you can use https://api.skypicker.com/rates
        "type": "bus",  # optional (bus/train)
        "source_id": 26323200,  # optional (carrier's id)
        "destination_id": 26383230,  # optional (carrier's id)
        "free_seats": seats_available,  # optional
        "carrier": "Flixbus",  # optional
    }
def shipping_weight_parse(element: Element) -> Optional[float]:
    """Return the shipping weight when it is given in pounds.

    The text extracted from the ``li`` node containing "Shipping Weight"
    must consist of exactly two whitespace-separated parts: the amount and
    the unit.

    :param element: element to search
    :return: the weight as a float when the unit is ``"lbs"``; ``None``
        when the node is missing or another unit is used
    :raises ValueError: if the extracted text does not split into exactly
        two parts, or the amount is not a valid float
    """
    item = element.find("li", containing="Shipping Weight", first=True)
    if not item:
        return None
    weight_text = parse_html_text_btw(item.text, "\n?\n", "\n")
    amount, unit = weight_text.split()
    return float(amount) if unit == "lbs" else None
def _get_jobs(self, section: Element, page_no: str) -> List[Job]:
    """Return the job postings found within a company section.

    Every ``li > a`` link inside ``ul.jobs._list`` becomes one ``Job``.
    The job hash is the SHA-256 hex digest of the link's text lines
    joined with ``::``.

    :param section: html content to process to extract jobs
    :type section: Element
    :param page_no: the section part being processed
    :type page_no: str
    :return: list of jobs, empty when the section has no job list
    :rtype: List[Job]
    """
    company = section.find('span', first=True)
    listing = section.find('ul.jobs._list', first=True)
    anchors = listing.find('li > a') if listing else []

    jobs: List[Job] = []
    for anchor in anchors:
        parts = anchor.text.split('\n')
        digest = hashlib.sha256('::'.join(parts).encode('utf-8')).hexdigest()
        details = {
            'company_name': company.text if company else '',
            'title': ' | '.join(parts),
            'href': urljoin(self.url, anchor.attrs['href']),
            # The second text line, when present, is used as the location.
            'location': parts[1] if len(parts) != 1 else None,
            'deadline': None,
        }
        jobs.append(Job(page_no=page_no, hash=digest, data=details))
    return jobs
def _parse_tweet(tweet: Element) -> dict:
    """Convert one tweet element into a plain dict.

    Ids and names come from the ``data-*`` attributes of the ``div.tweet``
    node; ``created_at`` is built from the millisecond timestamp attribute
    as a UTC-aware datetime.

    :param tweet: element containing a single tweet
    :return: dict with the tweet's ids, author, text, counters and mentions
    """
    container = tweet.find('div.tweet', first=True)
    stamp = tweet.find('a.tweet-timestamp > span._timestamp', first=True)

    # data-time-ms is in milliseconds; fromtimestamp() expects seconds.
    epoch_seconds = int(stamp.attrs['data-time-ms']) / 1000
    created_at = datetime.datetime.fromtimestamp(
        epoch_seconds, tz=datetime.timezone.utc)

    parsed = {}
    parsed['id'] = int(container.attrs['data-tweet-id'])
    parsed['conversation_id'] = int(container.attrs['data-conversation-id'])
    parsed['created_at'] = created_at
    parsed['user_id'] = int(container.attrs['data-user-id'])
    parsed['user_name'] = container.attrs['data-name']
    parsed['user_screen_name'] = container.attrs['data-screen-name']
    parsed['text'] = container.find('p.tweet-text', first=True).text
    parsed['replies_count'] = _tweet_stat(container, 'reply')
    parsed['retweets_count'] = _tweet_stat(container, 'retweet')
    parsed['favorites_count'] = _tweet_stat(container, 'favorite')
    # A missing data-mentions attribute yields an empty list.
    parsed['mentions'] = container.attrs.get('data-mentions', '').split()
    return parsed
def from_html(cls, block: Element) -> Optional[Product]:
    """Build an instance from an html block, or return ``None``.

    The SKU is read from the ``data-app-insights-track-search-doc-id``
    attribute of the first link in the block. The name is the first text
    line without a "!" and the price is the first ``PRICE_RE`` match.

    NOTE(review): only the first ``a`` element is ever inspected; if the
    intent was to scan all links for the attribute, this needs changing.

    :param block: element describing one product
    :return: ``cls(sku, name, price)`` with ``price`` set to ``None`` when
        no price is found; ``None`` when the block has no links or the
        first link lacks the SKU attribute
    :raises IndexError: if every text line of the block contains "!"
    """
    first_link = next(iter(block.find("a")), None)
    if first_link is None:
        return None
    link_attrs = first_link.attrs
    if "data-app-insights-track-search-doc-id" not in link_attrs:
        return None
    sku = link_attrs["data-app-insights-track-search-doc-id"]

    prices = PRICE_RE.findall(block.text)
    plain_lines = [line for line in block.text.split("\n") if "!" not in line]
    name = plain_lines[0]
    return cls(sku, name, float(prices[0]) if prices else None)
def parse_poll_options(poll_element: Element):
    """Collect the options of a poll, keyed by their rank.

    For every option node the raw option text is reduced to its first run
    of letters, commas and spaces and handed to ``get_option_rank``; the
    vote count is the number in front of " ä" in the vote-count node.

    :param poll_element: element containing the poll
    :return: dict mapping each rank to a dict with the keys ``text``,
        ``vote_count`` and ``rank``
    :raises ValueError: if an option does not contain exactly one text
        node or exactly one vote-count node
    """
    options_by_rank = {}
    for option_el in poll_element.find(OldExamSelectors.poll_option):
        # Single-element unpacking: exactly one match is required.
        [text_el] = option_el.find(OldExamSelectors.poll_option_text)
        raw_text = re.search(
            r'[A-ZÄÖ, ]+', text_el.text, re.IGNORECASE).group(0)
        rank, text = get_option_rank(raw_text)

        [votes_el] = option_el.find(OldExamSelectors.option_vote_count)
        votes_match = re.search("([0-9]+) ä", votes_el.text)
        votes = int(votes_match.group(1))

        options_by_rank[rank] = {
            'text': text,
            'vote_count': votes,
            'rank': rank,
        }
    return options_by_rank
def get_pagination_details(self, page: Element) -> List[Dict[str, str]]:
    """Return the paging links found in an html page.

    :param page: html page to process to extract paging details
    :type page: Element
    :return: one dict per ``.pagerLink`` inside ``div.results-paging``,
        with the link text as ``page_no`` and the link id, underscores
        replaced by ``$``, as ``event_target``; empty when the paging
        block is missing
    :rtype: List[Dict[str, str]]
    """
    paging = page.find('div.results-paging', first=True)
    if not paging:
        return []
    return [
        {
            'page_no': link.text,
            'event_target': link.attrs.get('id').replace('_', '$'),
        }
        for link in paging.find('.pagerLink')
    ]
def get_jobs(self, page: Element, page_no: str) -> List[Job]:
    """Return the job postings found within an html page.

    Every ``tr`` of ``table#tableResults`` that has ``td`` cells becomes
    one ``Job``. The job hash is the SHA-256 hex digest of the cell texts
    joined with ``::``.

    :param page: html page to process to extract jobs
    :type page: Element
    :param page_no: the page number being processed
    :type page_no: str
    :return: list of jobs, empty when the results table is missing
    :rtype: List[Job]
    """
    jobs: List[Job] = []
    table = page.find('table#tableResults', first=True)
    if not table:
        return jobs

    for row in table.find('tr'):
        cells = row.find('td')
        if not cells:
            # Rows without td cells are skipped.
            continue
        joined = '::'.join([cell.text for cell in cells])
        digest = hashlib.sha256(joined.encode('utf-8')).hexdigest()
        details = {
            'title': cells[0].text.replace('\xa0', ' '),
            'href': cells[0].find('a', first=True).attrs['href'],
            'location': cells[1].text,
            'job-family': cells[2].text,
            'deadline': cells[3].text,
        }
        jobs.append(Job(page_no=page_no, hash=digest, data=details))
    return jobs
def get_player_stats(cls, element: Element, url: str, name: str) -> dict:
    """Collect a player's statistics from one table row.

    Each statistic is the text of one ``td`` cell; the columns run
    consecutively from the 5th cell (``G``) to the 27th (``IBB``).

    :param element: row element holding the ``td`` cells
    :param url: stored unchanged under ``url``
    :param name: stored unchanged under ``name``
    :return: dict with ``url``, ``name`` and one entry per statistic
    """
    # Column labels in page order; the first one sits in td:nth-child(5).
    stat_names = (
        'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS',
        'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GIDP', 'HBP', 'SH',
        'SF', 'IBB',
    )
    stats = {'url': url, 'name': name}
    for column, stat in enumerate(stat_names, start=5):
        selector = 'td:nth-child({})'.format(column)
        stats[stat] = element.find(selector, first=True).text
    return stats
def showcase_image_parse(element: Element) -> Optional[str]:
    """Return the ``src`` of the first image in the element, if any.

    :param element: element to search
    :return: the image's ``src`` attribute, or ``None`` without an image
    :raises KeyError: if the image has no ``src`` attribute
    """
    image = element.find("img", first=True)
    return image.attrs["src"] if image else None
def trial_product_parse(element: Element) -> Optional[bool]:
    """Return ``True`` when the element carries a "Trial Product" flag.

    :param element: element to search for ``.product-flag-trial``
    :return: ``True`` if the flag is present, otherwise ``None``
        (never ``False``)
    """
    trial_flag = element.find(".product-flag-trial",
                              containing="Trial Product",
                              first=True)
    return True if trial_flag else None
def clearance_parse(element: Element) -> Optional[bool]:
    """Return ``True`` when the element carries a "Clearance" flag.

    :param element: element to search for ``.product-flag-clearance``
    :return: ``True`` if the flag is present, otherwise ``None``
        (never ``False``)
    """
    clearance_flag = element.find(".product-flag-clearance",
                                  containing="Clearance",
                                  first=True)
    if not clearance_flag:
        return None
    return True
def rating_count_parse(element: Element) -> Optional[int]:
    """Return the number of ratings shown in the element, if any.

    The count is the text of the first ``span`` inside ``.rating-count``.
    The container is looked up once, and a container without a ``span``
    yields ``None`` instead of failing on a missing node.

    :param element: element to search
    :return: the rating count, or ``None`` when the container or its
        ``span`` is missing
    :raises ValueError: if the span text is not an integer
    """
    container = element.find(".rating-count", first=True)
    if not container:
        return None
    count_span = container.find("span", first=True)
    if not count_span:
        return None
    return int(count_span.text)
def stars_parse(element: Element) -> Optional[float]:
    """Return the star rating from the ``.stars`` node's title, if any.

    The rating is the part of the ``title`` attribute in front of the
    first "/5". A title without "/5" is converted as a whole.

    :param element: element to search
    :return: the rating as a float, or ``None`` without a ``.stars`` node
    :raises KeyError: if the node has no ``title`` attribute
    :raises ValueError: if the rating text is not a valid float
    """
    stars_element = element.find(".stars", first=True)
    if not stars_element:
        return None
    # partition() keeps the whole title when "/5" is absent; slicing with
    # the -1 that str.find() returns would drop the last character and
    # turn "4.5" into 4.0.
    rating, _, _ = stars_element.attrs["title"].partition("/5")
    return float(rating)
def best_seller_parse(element: Element) -> Optional[bool]:
    """Return ``True`` when the element carries a "Best Seller" badge.

    :param element: element to search for ``.product-best-seller``
    :return: ``True`` if the badge is present, otherwise ``None``
        (never ``False``)
    """
    badge = element.find(".product-best-seller",
                         containing="Best Seller",
                         first=True)
    return True if badge else None
def shipping_saver_parse(element: Element) -> Optional[bool]:
    """Return ``True`` when the element carries a "Shipping Saver" badge.

    :param element: element to search for ``.shipping-saver``
    :return: ``True`` if the badge is present, otherwise ``None``
        (never ``False``)
    """
    saver_badge = element.find(".shipping-saver",
                               containing="Shipping Saver",
                               first=True)
    if not saver_badge:
        return None
    return True
def dimensions_parse(element: Element) -> Optional[str]:
    """Return the dimensions text listed in the element, if any.

    :param element: element to search for an ``li`` node containing
        "Dimensions"
    :return: what ``parse_html_text_btw`` extracts between two newlines of
        the node text, or ``None`` when the node is missing
    """
    item = element.find("li", containing="Dimensions", first=True)
    if not item:
        return None
    return parse_html_text_btw(item.text, "\n", "\n")
def in_stock_parse(element: Element) -> Optional[bool]:
    """Report the stock status shown in the element.

    The "Out of Stock" marker is checked before the "In Stock" marker.

    :param element: element to search
    :return: ``False`` for "Out of Stock", ``True`` for "In Stock" and
        ``None`` when neither marker is present
    """
    # (selector, label, result) in the order the markers are checked.
    markers = (
        (".text-danger", "Out of Stock", False),
        (".text-primary", "In Stock", True),
    )
    for selector, label, verdict in markers:
        if element.find(selector, containing=label, first=True):
            return verdict
    return None
def upc_parse(element: Element) -> Optional[str]:
    """Return the UPC code listed in the element, if any.

    The code is everything after the first ": " in the text of the ``li``
    node containing "UPC Code".

    :param element: element to search
    :return: the UPC code; ``None`` when the node is missing or its text
        has no ": " separator
    """
    item = element.find("li", containing="UPC Code", first=True)
    if not item:
        return None
    # partition() makes a missing separator explicit; str.find() + 2 would
    # evaluate to 1 in that case and silently drop the first character.
    _, separator, code = item.text.partition(": ")
    return code if separator else None