def _get_parts_of_speech(self, parts_of_speech: Element) -> List[str]:
    """Gets the parts of speech of the current word.

    Extracts the parts of speech from the definition text because they
    are not stated in a dedicated field.

    :type parts_of_speech: Element
    :param parts_of_speech: Element found inside div.definition p.
    :rtype: List[str]
    :return: List of part-of-speech tags.
    """
    indices_pos_mapping = {}
    tokens = parts_of_speech.text.split()
    for part_of_speech in self.parts_of_speech:
        for index, token in enumerate(tokens):
            # Keep tokens that start with a known part-of-speech tag and
            # exceed it by at most one character (e.g. a trailing period).
            if (token.startswith(part_of_speech)
                    and len(token) < len(part_of_speech) + 2):
                indices_pos_mapping[index] = part_of_speech
    if not indices_pos_mapping:
        # No tag found; avoid max() on an empty mapping.
        return []
    # The highest index is the tag nearest to the definition text.
    max_index = max(indices_pos_mapping.keys())
    # Pop it from the mapping for now so it can be appended as the last tag.
    last_part_of_speech = indices_pos_mapping.pop(max_index)
    parts_of_speech = list(indices_pos_mapping.values())
    parts_of_speech.append(last_part_of_speech)
    return parts_of_speech
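# A hedged usage sketch for _get_parts_of_speech. The original host class is
# not shown in this snippet, so _Demo below is hypothetical: it only supplies
# the `parts_of_speech` attribute the method reads.
from requests_html import HTML

class _Demo:
    parts_of_speech = ["noun", "verb"]

element = HTML(html="<p>noun. verb. a word naming a thing</p>").find("p", first=True)
print(_get_parts_of_speech(_Demo(), element))  # -> ['noun', 'verb'] (nearest tag last)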
def parse_flat(self, html: Element) -> None:  # noqa: CCR001
    """Get info about a flat.

    Extracts all flat info from the given html element.

    :param html: Given element.
    """
    try:
        flat_url = html.find("a", first=True).attrs.get("href")
        flat_id = int(re.search(r"flat/(\d+)", flat_url).group(1))
        location = html.xpath(".//a[@data-name='GeoLabel']/text()")
        if self.domain == "ekb":
            location = location[1:]
        city, district, *location = location
        location = " ".join(location)
        price = html.xpath(".//span[@data-mark='MainPrice']/text()", first=True)
        price = int(price.replace("₽", "").strip().replace(" ", ""))
        ppm = html.xpath(".//p[@data-mark='PriceInfo']/text()", first=True)
        ppm = int(ppm.replace("₽/м²", "").strip().replace(" ", ""))
        square = round(price / ppm, 2)
        if not Flat.exists(id=flat_id):
            Flat(
                id=flat_id,
                city=city,
                district=district,
                location=location,
                price=price,
                ppm=ppm,
                square=square,
            )
            commit()
    except Exception as exc:
        print(exc)
        rollback()
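# Hedged usage sketch for parse_flat: drive it over CIAN-style search-result
# cards with requests_html. The URL and the card selector are assumptions,
# not confirmed by the snippet above; `parser` stands for an instance of the
# (unshown) host class, and Flat/commit/rollback are Pony-ORM-style bindings.
from requests_html import HTMLSession

session = HTMLSession()
response = session.get("https://ekb.cian.ru/cat.php?deal_type=sale")  # hypothetical URL
for card in response.html.find("article[data-name='CardComponent']"):  # assumed selector
    parser.parse_flat(card)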
def title_parse(element: Element) -> Optional[str]:
    # Prefer the .product-title node; fall back to the #name element.
    title_element = element.find(".product-title", first=True)
    if title_element:
        return title_element.text
    title_element = element.find("#name", first=True)
    if title_element:
        return title_element.text
    return None
def price_discount_parse(element: Element) -> Optional[float]:
    # Strip the leading currency symbol before converting to float.
    discount_element = element.find(".discount-green", first=True)
    if discount_element:
        return float(discount_element.text[1:])
    price_element = element.find(".price", first=True)
    if price_element:
        return float(price_element.text[1:])
    return None
def parse_row(self, obj: Element) -> dict:
    """Parse object for item information."""
    media_and_category = get_first_element(obj, self.MEDIA_AND_CATEGORY_TYPE)
    seeders_and_leechers = obj.find(self.SEEDERS_AND_LEACHERS)
    return {
        "name": get_first_element(obj, self.ITEM_NAME).text,
        "media_type": media_and_category.find("a")[0].text,
        "category": media_and_category.find("a")[1].text,
        "seeders": seeders_and_leechers[0].text,
        "leachers": seeders_and_leechers[1].text,
        "magnet_link": get_first_element(obj, self.ITEM_MAGNET_LINK).links.pop(),
        "file_size": get_first_element(obj, "font.detDesc").text.split(",")[1],
        "vip_status": get_first_element(obj, self.UPLOADER_VIP),
        "trusted": get_first_element(obj, self.UPLOADER_TRUSTED),
        "href": get_first_element(obj, "a.detLink").links.pop(),
        "uploader": obj.find("font.detDesc")[0].text.split("by")[-1].strip(),
        "comment_count": (
            0
            if not obj.search(self.ITEM_COMMENTS)
            else obj.search(self.ITEM_COMMENTS).fixed[0]
        ),
    }
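# parse_row relies on a get_first_element helper that is not shown here. A
# minimal sketch consistent with its call sites, assuming it simply wraps
# find(first=True):
def get_first_element(obj: Element, selector: str) -> Element:
    # First match for the CSS selector, or None when nothing matches.
    return obj.find(selector, first=True)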
def save_x_percent_in_cart_parse(element: Element) -> Optional[int]:
    save_element = element.find("title", containing="in Cart", first=True)
    if save_element:
        return int(
            parse_html_text_btw(save_element.text, "Save ", "% in Cart"))
    return None
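# Several parsers here (see also expiration_date_parse and
# shipping_weight_parse) call parse_html_text_btw, which is not shown. A
# hedged sketch consistent with the call sites: the start marker can look
# like a regex (e.g. "\n?\n"), so a regex-based implementation is assumed.
import re
from typing import Optional

def parse_html_text_btw(text: str, start: str, end: str) -> Optional[str]:
    # Capture whatever sits between the first start/end marker pair.
    match = re.search(f"{start}(.*?){end}", text, re.DOTALL)
    return match.group(1) if match else None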
def loyalty_credit_x_percent_parse(element: Element) -> Optional[int]:
    loyalty_credit_element = element.find(
        ".slanted-container", containing="Loyalty Credit", first=True)
    if loyalty_credit_element:
        text = loyalty_credit_element.text
        return int(text[:text.find("% Loyalty Credit")])
    return None
def free_shipping_over_x_dollars_parse(element: Element) -> Optional[int]:
    free_shipping_element = element.find(".banner-alert", first=True)
    if free_shipping_element:
        text = free_shipping_element.text
        if "Free Shipping\xa0for orders over" in text:
            # The amount starts two characters past the first newline,
            # skipping the "$" sign.
            return int(text[text.find("\n") + 2:])
    return None
def get_image_link(images: List[Element], article: dict, key: str) -> None:
    # Pop candidates from the end until a jpg source is found.
    while images:
        img = images.pop()
        link = img.attrs.get('src')
        if link and link[-3:] == 'jpg':
            article.update({key: link})
            break
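# Hedged usage sketch for get_image_link: find() returns a list of Elements,
# consumed from the end until a jpg src turns up. The URL and selector are
# illustrative assumptions.
from requests_html import HTMLSession

r = HTMLSession().get("https://example.org/article")  # hypothetical page
article = {}
get_image_link(r.html.find("img"), article, "image")
# article == {"image": "...jpg"} if any <img src> ends in jpg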
def product_code_parse(element: Element) -> Optional[str]:
    product_code_element = element.find(
        "li", containing="Product Code", first=True)
    if product_code_element:
        text = product_code_element.text
        return text[text.find(": ") + 2:]
    return None
def package_qty_parse(element: Element) -> Optional[str]:
    package_qty_element = element.find(
        "li", containing="Package Quantity", first=True)
    if package_qty_element:
        text = package_qty_element.text
        return text[text.find(": ") + 2:]
    return None
def yield_pron(
    request_html: requests_html.Element,
    ipa_xpath_selector: str,
    config: "Config",
) -> "Iterator[Pron]":
    for ipa_element in request_html.xpath(ipa_xpath_selector):
        m = re.search(config.ipa_regex, ipa_element.text)
        if not m:
            continue
        pron = m.group(1)
        # Removes parens around various segments.
        pron = pron.replace("(", "").replace(")", "")
        if _skip_pron(pron, config.skip_spaces_pron):
            continue
        try:
            # All pronunciation processing is done in NFD-space.
            pron = unicodedata.normalize("NFD", pron)
            pron = config.process_pron(pron)
        except IndexError:
            logging.info(
                "IndexError encountered processing %s during scrape of %s",
                pron,
                config.language,
            )
            continue
        if pron:
            # The segments package inserts a # in between spaces.
            if not config.skip_spaces_pron:
                pron = pron.replace(" #", "")
            yield pron
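# A hedged stub of the Config object yield_pron reads (the real class lives
# elsewhere in this project); only the attributes used above are sketched,
# and the defaults are illustrative assumptions.
import dataclasses
from typing import Callable

@dataclasses.dataclass
class _ConfigStub:
    ipa_regex: str = r"/(.+?)/"  # capture IPA between slashes (assumed)
    skip_spaces_pron: bool = False
    language: str = "eng"
    # default_factory keeps the callable an instance attribute, so it is
    # not turned into a bound method on attribute access.
    process_pron: Callable[[str], str] = dataclasses.field(
        default_factory=lambda: (lambda pron: pron))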
def find_element_by_tag_name(element: Element, tag_name):
    # Serialize access to the shared Chrome instance across threads.
    with Chrome.lock:
        return element.find(tag_name, first=True)
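# Hedged sketch of the Chrome holder assumed above: a class-level lock
# shared across threads (the real class is not shown in this snippet).
import threading

class Chrome:
    lock = threading.Lock()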
def expiration_date_parse(element: Element) -> Optional[datetime.datetime]:
    expiration_date_element = element.find(
        "li", containing="Expiration Date", first=True)
    if expiration_date_element:
        expiration_date = parse_html_text_btw(
            expiration_date_element.text, "\n?\n", "\n")
        # Dates appear as e.g. "January 2022".
        return datetime.datetime.strptime(expiration_date, "%B %Y")
    return None
def parent(self) -> Union[DrissionElement, None]:
    """requests_html's Element wraps an lxml element object; read the
    parent relation from the underlying lxml object, then re-wrap it."""
    try:
        return SessionElement(
            Element(element=self.inner_ele.element.xpath('..')[0],
                    url=self.inner_ele.url))
    except IndexError:
        return None
def get_ride_info(ride: Element) -> dict:
    price = float(
        ride.find("span.num.currency-small-cents")[0].text.split("\xa0")[0])
    ride_times = ride.find("div.ride-times")[0].text.split()
    departure_time, arrival_time = ride_times[0], ride_times[1]
    seats_str = ride.find("div.seats-notice")
    source = ride.find("div.departure-station-name")[0].text
    destination = ride.find("div.arrival-station-name")[0].text
    # duration = ride.find("div.duration")
    # departure = date + departure_time
    # arrival = departure + trip_length
    seats_available = None
    if seats_str and len(seats_str) > 0:
        seats_str = seats_str[0].text
        matcher = re.match(r"(\d+)\s+\w+", seats_str)
        if matcher:
            seats_available = int(matcher.groups()[0])
    return {
        "departure_datetime": departure_time,
        "arrival_datetime": arrival_time,  # "2018-06-20 15:00:00"
        "source": source,
        "destinations": destination,
        "price": price,  # in EUR - you can use https://api.skypicker.com/rates
        "type": "bus",  # optional (bus/train)
        "source_id": 26323200,  # optional (carrier's id)
        "destination_id": 26383230,  # optional (carrier's id)
        "free_seats": seats_available,  # optional
        "carrier": "Flixbus",  # optional
    }
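# Hedged usage sketch for get_ride_info over Flixbus search results; the
# URL and the row selector are assumptions for illustration.
from requests_html import HTMLSession

page = HTMLSession().get("https://shop.flixbus.com/search")  # hypothetical query URL
for ride in page.html.find("div.ride-item"):  # assumed result-row selector
    print(get_ride_info(ride))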
def movements(process: requests_html.Element) -> List[Dict]:
    rows = process.xpath('//tr')
    result = []
    for row in rows:
        data = [col.text for col in row.xpath('//td')]
        if not data:
            # Skip rows that carry no cells (e.g. header rows).
            continue
        result.append({'data': data[0], 'movimento': ''.join(data[1:])})
    return result
def prevs(self, num: int = 1):
    """requests_html's Element wraps an lxml element object; read sibling
    relations from the underlying lxml object, then re-wrap them."""
    try:
        return SessionElement(
            Element(element=self.inner_ele.element.xpath(
                f'./preceding-sibling::*[{num}]')[0],
                    url=self.inner_ele.url))
    except IndexError:
        return None
def shipping_weight_parse(element: Element) -> Optional[float]:
    shipping_weight_element = element.find(
        "li", containing="Shipping Weight", first=True)
    if shipping_weight_element:
        shipping_weight, shipping_unit = parse_html_text_btw(
            shipping_weight_element.text, "\n?\n", "\n").split()
        # Only pound values are returned; other units are ignored.
        if shipping_unit == "lbs":
            return float(shipping_weight)
    return None
def parents(self, num: int = 1):
    """requests_html's Element wraps an lxml element object; read ancestor
    relations from the underlying lxml object, then re-wrap them."""
    try:
        return SessionElement(
            Element(element=self.inner_ele.element.xpath(
                f'..{"/.." * (num - 1)}')[0],
                    url=self.inner_ele.url))
    except IndexError:
        return None
def _get_jobs(self, section: Element, page_no: str) -> List[Job]:
    """Returns job postings within a company section.

    :param section: html content to process to extract jobs.
    :type section: Element
    :param page_no: the section part being processed.
    :type page_no: str
    :return: list of jobs.
    :rtype: List[Job]
    """
    jobs: List[Job] = []
    company = section.find('span', first=True)
    content = section.find('ul.jobs._list', first=True)
    rows = [] if not content else content.find('li > a')
    for row in rows:
        title_parts = row.text.split('\n')
        # Hash the joined title parts to give each posting a stable id.
        text = '::'.join(title_parts)
        text_hash = hashlib.sha256(text.encode('utf-8'))
        jobs.append(
            Job(
                **{
                    'page_no': page_no,
                    'hash': text_hash.hexdigest(),
                    'data': {
                        'company_name': company.text if company else '',
                        'title': ' | '.join(title_parts),
                        'href': urljoin(self.url, row.attrs['href']),
                        'location': None
                        if len(title_parts) == 1 else title_parts[1],
                        'deadline': None,
                    },
                }))
    return jobs
def _parse_tweet(tweet: Element) -> dict:
    div = tweet.find('div.tweet', first=True)
    timestamp = tweet.find('a.tweet-timestamp > span._timestamp', first=True)
    created_at = datetime.datetime.fromtimestamp(
        int(timestamp.attrs['data-time-ms']) / 1000,
        tz=datetime.timezone.utc)
    return {
        'id': int(div.attrs['data-tweet-id']),
        'conversation_id': int(div.attrs['data-conversation-id']),
        'created_at': created_at,
        'user_id': int(div.attrs['data-user-id']),
        'user_name': div.attrs['data-name'],
        'user_screen_name': div.attrs['data-screen-name'],
        'text': div.find('p.tweet-text', first=True).text,
        'replies_count': _tweet_stat(div, 'reply'),
        'retweets_count': _tweet_stat(div, 'retweet'),
        'favorites_count': _tweet_stat(div, 'favorite'),
        'mentions': div.attrs.get('data-mentions', '').split(),
    }
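# _parse_tweet calls a _tweet_stat helper that is not shown here. A hedged
# sketch against legacy Twitter markup; the selector and attribute name are
# educated guesses, not confirmed by the snippet above.
def _tweet_stat(div: Element, action: str) -> int:
    counter = div.find(
        f'span.ProfileTweet-action--{action} span.ProfileTweet-actionCount',
        first=True)
    if counter and 'data-tweet-stat-count' in counter.attrs:
        return int(counter.attrs['data-tweet-stat-count'])
    return 0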
def parts(process_parts: requests_html.Element) -> List[List[Dict]]:
    rows = process_parts.xpath('//tr')
    result = []
    for row in rows:
        data = []
        values = row.text.replace('\xa0', '').replace(':\n', ':').split('\n')
        for value in values:
            value = value.split(':')
            if len(value) < 2:
                # Skip stray lines that carry no "key: value" pair.
                continue
            data.append({value[0]: value[1].strip()})
        result.append(data)
    return result
def execute_session_find(
        page_or_ele: BaseParser,
        loc: Tuple[str, str],
        mode: str = 'single',
        show_errmsg: bool = False
) -> Union[SessionElement, str, List[Union[SessionElement, str]], None]:
    """Perform an element lookup in session mode.

    Used both for finding elements in a page and for finding an element's
    sub-elements.

    :param page_or_ele: requests_html page or element object
    :param loc: element locator tuple
    :param mode: 'single' or 'all', returning the first match or all matches
    :param show_errmsg: whether to print an error message on exception
    :return: a SessionElement, a string, or a list of them
    """
    mode = mode or 'single'
    if mode not in ['single', 'all']:
        raise ValueError("Argument mode can only be 'single' or 'all'.")
    loc_by, loc_str = loc
    try:
        ele = None
        if loc_by == 'xpath':
            if 'PyQuery' in str(type(page_or_ele.element)):
                # Searching from a page.
                ele = page_or_ele.xpath(loc_str)
            elif 'HtmlElement' in str(type(page_or_ele.element)):
                # Searching from an element; going through the underlying
                # lxml object makes ancestor elements reachable.
                try:
                    elements = page_or_ele.element.xpath(loc_str)
                    ele = [
                        Element(element=e, url=page_or_ele.url)
                        for e in elements
                    ]
                except AttributeError:
                    ele = page_or_ele.xpath(loc_str)
        else:
            # Look up by css selector.
            ele = page_or_ele.find(loc_str)
        if mode == 'single':
            ele = ele[0] if ele else None
            if ele is None:
                return None
            return SessionElement(ele) if isinstance(
                ele, Element) else unescape(ele).replace('\xa0', ' ')
        elif mode == 'all':
            ele = filter(lambda x: x != '\n', ele)  # Drop inter-element newlines.
            ele = map(
                lambda x: unescape(x).replace('\xa0', ' ')
                if isinstance(x, str) else x, ele)  # Normalize non-breaking spaces.
            return [
                SessionElement(e) if isinstance(e, Element) else e
                for e in ele
            ]
    except Exception:
        if show_errmsg:
            print('Element(s) not found.', loc)
            raise
        return [] if mode == 'all' else None
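# Hedged usage sketch for execute_session_find: `page` stands for any
# requests_html page or element wrapped by BaseParser (illustrative names).
# Any first tuple member other than 'xpath' routes to the css-selector branch.
first_item = execute_session_find(page, ('xpath', '//li[@class="item"]'))
all_items = execute_session_find(page, ('css selector', 'li.item'), mode='all')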
def from_html(cls, block: Element) -> Optional["Product"]:
    for link in block.find("a"):
        try:
            sku = link.attrs["data-app-insights-track-search-doc-id"]
            break
        except KeyError:
            # Not every anchor carries the SKU attribute; try the next one.
            continue
    else:
        # No link carried a SKU, so this block is not a product card.
        return None
    price = PRICE_RE.findall(block.text)
    name = [line for line in block.text.split("\n") if "!" not in line][0]
    return cls(sku, name, float(price[0]) if price else None)
def general_data(process_general_data: requests_html.Element) -> Dict:
    result = {}
    names = [
        'Classe', 'Área', 'Assunto', 'Distribuição', 'Juiz', 'Relator',
        'Valor da ação'
    ]
    for name in names:
        field = process_general_data.xpath(
            f"//tr[contains(string(), '{name}')]", first=True)
        if field:
            # Normalize "label: value" rows into label/value pairs.
            parts = field.text.replace(': ', ':\n').split(':\n')
            if len(parts) >= 2:
                result[parts[0]] = parts[1]
    return result
def parse_poll_options(poll_element: Element):
    options_by_rank = {}
    for el_option in poll_element.find(OldExamSelectors.poll_option):
        # Exactly one text node is expected per option.
        el_option_text, = el_option.find(OldExamSelectors.poll_option_text)
        option_text_raw = re.search(
            r'[A-ZÄÖ, ]+', el_option_text.text, re.IGNORECASE).group(0)
        option_rank, option_text = get_option_rank(option_text_raw)
        el_option_vote_count, = el_option.find(
            OldExamSelectors.option_vote_count)
        option_vote_count_match = re.search(
            r"([0-9]+) ä", el_option_vote_count.text)
        option_vote_count = int(option_vote_count_match.group(1))
        options_by_rank[option_rank] = dict(
            text=option_text,
            vote_count=option_vote_count,
            rank=option_rank)
    return options_by_rank
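# parse_poll_options depends on a get_option_rank helper not shown here. A
# hedged guess from the call site: raw text such as "A, SOMETHING" splits
# into a rank letter and the option text.
from typing import Tuple

def get_option_rank(option_text_raw: str) -> Tuple[str, str]:
    rank, _, text = option_text_raw.partition(',')
    return rank.strip(), text.strip()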
def yield_pron(
    request_html: requests_html.Element,
    ipa_xpath_selector: str,
    config: "Config",
) -> "Iterator[Pron]":
    for ipa_element in request_html.xpath(ipa_xpath_selector):
        m = re.search(config.ipa_regex, ipa_element.text)
        if not m:
            continue
        pron = m.group(1)
        # Removes parens around various segments.
        pron = pron.replace("(", "").replace(")", "")
        if _skip_pron(pron):
            continue
        pron = config.process_pron(pron)
        if pron:
            yield pron
def get_pagination_details(self, page: Element) -> List[Dict[str, str]]:
    """Returns paging details within an html page.

    :param page: html page to process to extract paging details.
    :type page: Element
    :return: list of paging details.
    :rtype: List[Dict[str, str]]
    """
    links = []
    paging = page.find('div.results-paging', first=True)
    spans = [] if not paging else paging.find('.pagerLink')
    for span in spans:
        links.append({
            'page_no': span.text,
            'event_target': span.attrs.get('id', '').replace('_', '$'),
        })
    return links
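# The '$'-joined event_target suggests ASP.NET-style postback paging. A
# hedged sketch of driving the next page with it; the form fields beyond
# __EVENTTARGET, and the `scraper`/`session`/`response` objects, are
# assumptions for illustration.
for detail in scraper.get_pagination_details(response.html):
    response = session.post(scraper.url, data={
        '__EVENTTARGET': detail['event_target'],
        '__EVENTARGUMENT': '',
    })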
def execute_session_find(
        page_or_ele: BaseParser,
        loc: tuple,
        mode: str = 'single',
        show_errmsg: bool = False
) -> Union[SessionElement, List[SessionElement], None]:
    """Perform an element lookup in session mode.

    Used both for finding elements in a page and for finding an element's
    sub-elements.

    :param page_or_ele: requests_html page or element object
    :param loc: element locator tuple
    :param mode: 'single' or 'all', returning the first match or all matches
    :param show_errmsg: whether to print an error message on exception
    :return: a SessionElement or a list of them
    """
    mode = mode or 'single'
    if mode not in ['single', 'all']:
        raise ValueError("Argument mode can only be 'single' or 'all'.")
    loc_by, loc_str = loc
    try:
        ele = None
        if loc_by == 'xpath':
            if 'PyQuery' in str(type(page_or_ele.element)):
                # Searching from a page.
                ele = page_or_ele.xpath(loc_str)
            elif 'HtmlElement' in str(type(page_or_ele.element)):
                # Searching from an element.
                elements = page_or_ele.element.xpath(loc_str)
                ele = [
                    Element(element=e, url=page_or_ele.url)
                    for e in elements
                ]
        else:
            # Look up by css selector.
            ele = page_or_ele.find(loc_str)
        if mode == 'single':
            return SessionElement(ele[0]) if ele else None
        elif mode == 'all':
            return [SessionElement(e) for e in ele]
    except Exception:
        if show_errmsg:
            print('Element(s) not found.', loc)
            raise
        return [] if mode == 'all' else None