def _get_parts_of_speech(self, parts_of_speech: Element) -> List[str]:
        """
        Gets the parts of speech of the current word.

        Extracts the parts of speech from the definition text because it is
        not properly stated elsewhere. The tag nearest to the definition
        (highest token index) is placed last in the returned list.

        :type parts_of_speech: Element
        :param parts_of_speech: Element found inside div.definition p.
        :rtype: List[str]
        :return: List of part-of-speech tags (empty if none were found).
        """
        indices_pos_mapping = {}
        tokens = parts_of_speech.text.split()

        for part_of_speech in self.parts_of_speech:
            for index, token in enumerate(tokens):
                # A token counts as a POS tag when it starts with the tag and
                # is at most one character longer (e.g. a trailing "." or ",").
                if (token.startswith(part_of_speech)
                        and len(token) < len(part_of_speech) + 2):
                    indices_pos_mapping[index] = part_of_speech

        # BUGFIX: max() raises ValueError on an empty mapping; bail out
        # early when no part-of-speech token was recognised.
        if not indices_pos_mapping:
            return []

        # max_index means that this part of speech is the nearest to the definition
        max_index = max(indices_pos_mapping.keys())
        # remove it from the mapping for now so it can be appended last
        last_part_of_speech = indices_pos_mapping.pop(max_index)
        result = list(indices_pos_mapping.values())
        result.append(last_part_of_speech)

        return result
Example #2
0
    def parse_flat(self, html: Element) -> None:  # noqa: CCR001
        """Get info about flat.

        Get all info about flat in given html element and persist it as a
        ``Flat`` row (skipped when a row with the same id already exists).
        On any parsing/DB error the transaction is rolled back.

        :param html: Given element
        """
        try:
            # The first <a> holds the flat's URL; its numeric id is embedded
            # in the path as ".../flat/<id>".
            flat_url = html.find("a", first=True).attrs.get("href")
            flat_id = int(re.search(r"flat/(\d+)", flat_url).group(1))
            location = html.xpath(".//a[@data-name='GeoLabel']/text()")
            # NOTE(review): for the "ekb" domain the first geo label is
            # apparently redundant and is dropped — confirm against markup.
            if self.domain == "ekb":
                location = location[1:]
            city, district, *location = location
            location = " ".join(location)
            price = html.xpath(".//span[@data-mark='MainPrice']/text()",
                               first=True)
            # Prices look like "1 234 567 ₽" — strip currency sign and spaces.
            price = int(price.replace("₽", "").strip().replace(" ", ""))
            ppm = html.xpath(".//p[@data-mark='PriceInfo']/text()", first=True)
            ppm = int(ppm.replace("₽/м²", "").strip().replace(" ", ""))
            # Area is not listed directly; derive it from price / price-per-m².
            square = round(price / ppm, 2)
            if not Flat.exists(id=flat_id):
                Flat(
                    id=flat_id,
                    city=city,
                    district=district,
                    location=location,
                    price=price,
                    ppm=ppm,
                    square=square,
                )
                commit()
        except Exception as exc:
            # Best-effort scraping: report the failure and roll the ORM
            # transaction back rather than aborting the whole crawl.
            print(exc)
            rollback()
Example #3
0
 def title_parse(element: Element) -> Optional[str]:
     """Return the product title text, trying ``.product-title`` first and
     falling back to ``#name``; ``None`` when neither selector matches."""
     for selector in (".product-title", "#name"):
         found = element.find(selector, first=True)
         if found:
             return found.text
Example #4
0
 def price_discount_parse(element: Element) -> Optional[float]:
     """Return the discounted price when a ``.discount-green`` element is
     present, otherwise the regular ``.price``; ``None`` when neither exists.

     The leading currency character is stripped before conversion.
     """
     # Look each element up once: the original repeated the identical
     # ``.discount-green`` query three times (including a redundant
     # nested ``if`` that re-checked a condition already known true).
     discount = element.find(".discount-green", first=True)
     if discount:
         return float(discount.text[1:])
     price = element.find(".price", first=True)
     if price:
         return float(price.text[1:])
Example #5
0
 def parse_row(self, obj: Element) -> dict:
     """Parse object for item information.

     Extracts one torrent listing row into a flat dict. Positional ``find``
     indexing assumes the row layout: the first ``a`` under the
     media/category cell is the media type, the second the category; the
     first ``font.detDesc`` holds upload date, size and uploader details.

     :param obj: table-row element for a single listing
     :return: mapping of item fields (texts, links and raw elements)
     """
     return {
         "name":
         get_first_element(obj, self.ITEM_NAME).text,
         "media_type":
         get_first_element(obj,
                           self.MEDIA_AND_CATEGORY_TYPE).find("a")[0].text,
         "category":
         get_first_element(obj,
                           self.MEDIA_AND_CATEGORY_TYPE).find("a")[1].text,
         "seeders":
         obj.find(self.SEEDERS_AND_LEACHERS)[0].text,
         "leachers":
         obj.find(self.SEEDERS_AND_LEACHERS)[1].text,
         "magnet_link":
         get_first_element(obj, self.ITEM_MAGNET_LINK).links.pop(),
         "file_size":
         # Description text looks like "Uploaded ..., Size ..., ULed by ..."
         # so the second comma-field is the size.
         get_first_element(obj, "font.detDesc").text.split(",")[1],
         "vip_status":
         get_first_element(obj, self.UPLOADER_VIP),
         "trusted":
         get_first_element(obj, self.UPLOADER_TRUSTED),
         "href":
         get_first_element(obj, "a.detLink").links.pop(),
         "uploader":
         # Uploader name follows the final "by" in the description text.
         (obj.find("font.detDesc")[0].text.split("by")[-1].strip()),
         "comment_count": (0 if not obj.search(self.ITEM_COMMENTS) else
                           obj.search(self.ITEM_COMMENTS).fixed[0]),
     }
Example #6
0
 def save_x_percent_in_cart_parse(element: Element) -> Optional[int]:
     """Return the "Save X% in Cart" percentage, or ``None`` when no
     matching ``title`` element exists."""
     # Query the DOM once instead of repeating the identical lookup in
     # both the condition and the body.
     save_element = element.find("title", containing="in Cart", first=True)
     if save_element:
         return int(
             parse_html_text_btw(save_element.text, "Save ", "% in Cart"))
Example #7
0
 def loyalty_credit_x_percent_parse(element: Element) -> Optional[int]:
     """Return the loyalty-credit percentage advertised in the
     ``.slanted-container`` banner, or ``None`` when absent."""
     # One lookup replaces the duplicated DOM query; the original inner
     # ``if`` re-checked a value already known to be truthy.
     banner = element.find(".slanted-container",
                           containing="Loyalty Credit",
                           first=True)
     if banner:
         return int(banner.text[:banner.text.find("% Loyalty Credit")])
Example #8
0
 def free_shipping_over_x_dollars_parse(element: Element) -> Optional[int]:
     """Return the free-shipping threshold in dollars, or ``None`` when the
     ``.banner-alert`` element is missing or unrelated."""
     banner = element.find(".banner-alert", first=True)
     if not banner:
         return None
     text = banner.text
     if "Free Shipping\xa0for orders over" not in text:
         return None
     # The amount follows the newline and the currency symbol.
     return int(text[text.find("\n") + 2:])
Example #9
0
 def get_image_link(images: Element, article: dict, key: str) -> None:
     """Pop images (from the end) until one whose ``src`` ends in ``jpg``
     is found, then store that link in ``article`` under ``key``.

     The ``images`` sequence is consumed in place; images with no usable
     ``src`` are skipped instead of crashing.
     """
     while images:
         img = images.pop()
         link = img.attrs.get('src')
         # BUGFIX: a missing ``src`` yields None; the old ``link[-3:]``
         # slice comparison would raise TypeError on it.
         if link and link.endswith('jpg'):
             article.update({key: link})
             break
Example #10
0
 def product_code_parse(element: Element) -> Optional[str]:
     """Return the text following "Product Code: " in the matching ``li``
     element, or ``None`` when that element is absent."""
     code_item = element.find("li",
                              containing="Product Code",
                              first=True)
     if not code_item:
         return None
     text = code_item.text
     return text[text.find(": ") + 2:]
Example #11
0
 def package_qty_parse(element: Element) -> Optional[str]:
     """Return the text following "Package Quantity: " in the matching
     ``li`` element, or ``None`` when that element is absent."""
     qty_item = element.find("li",
                             containing="Package Quantity",
                             first=True)
     if not qty_item:
         return None
     text = qty_item.text
     return text[text.find(": ") + 2:]
Example #12
0
def yield_pron(
    request_html: requests_html.Element,
    ipa_xpath_selector: str,
    config: "Config",
) -> "Iterator[Pron]":
    """Yield cleaned pronunciations extracted from the given element.

    Each candidate is matched against ``config.ipa_regex``, stripped of
    parentheses, normalized to NFD and run through ``config.process_pron``;
    entries that fail matching, skipping rules or processing are dropped.
    """
    for ipa_element in request_html.xpath(ipa_xpath_selector):
        match = re.search(config.ipa_regex, ipa_element.text)
        if match is None:
            continue
        # Parens mark optional segments; drop them entirely.
        pron = match.group(1).replace("(", "").replace(")", "")
        if _skip_pron(pron, config.skip_spaces_pron):
            continue
        try:
            # All pronunciation processing is done in NFD-space.
            pron = unicodedata.normalize("NFD", pron)
            processed = config.process_pron(pron)
        except IndexError:
            logging.info(
                "IndexError encountered processing %s during scrape of %s",
                pron,
                config.language,
            )
            continue
        if processed:
            # The segments package inserts a # in-between spaces.
            if not config.skip_spaces_pron:
                processed = processed.replace(" #", "")
            yield processed
    def find_element_by_tag_name(element: Element, tag_name):
        """Find the first child of ``element`` matching ``tag_name`` while
        holding the global Chrome lock.

        Serializes DOM access across threads; the lock is released even if
        ``find`` raises.
        """
        # ``with`` both acquires and releases the lock. The original
        # acquired inside the ``try``, so a failed ``acquire()`` would
        # still hit the ``finally`` and release a lock it never obtained.
        with Chrome.lock:
            return element.find(tag_name, first=True)
Example #14
0
 def expiration_date_parse(element: Element) -> Optional[datetime.datetime]:
     """Return the expiration date from the matching ``li`` element as a
     ``datetime`` parsed with the "%B %Y" format, or ``None`` when absent."""
     exp_item = element.find("li",
                             containing="Expiration Date",
                             first=True)
     if not exp_item:
         return None
     raw_date = parse_html_text_btw(exp_item.text, "\n?\n", "\n")
     return datetime.datetime.strptime(raw_date, "%B %Y")
Example #15
0
 def parent(self) -> Union[DrissionElement, None]:
     """Return this element's parent wrapped as a ``SessionElement``.

     requests_html's ``Element`` wraps an lxml element; the parent is read
     from the underlying lxml object via the ``'..'`` XPath and re-wrapped.
     Returns ``None`` when there is no parent (e.g. at the document root).
     """
     try:
         return SessionElement(
             Element(element=self.inner_ele.element.xpath('..')[0],
                     url=self.inner_ele.url))
     except IndexError:
         return None
Example #16
0
def get_ride_info(ride: Element) -> dict:
    """Extract a single Flixbus ride row into a normalized dict.

    Parses price, departure/arrival times, station names and — when the
    seats notice is present and matches — the number of free seats.

    :param ride: html element for one ride row
    :return: ride description dict (``free_seats`` is ``None`` when unknown)
    """
    price = float(
        ride.find("span.num.currency-small-cents")[0].text.split("\xa0")[0])
    # Hoist the duplicated lookup; text is "HH:MM HH:MM".
    times = ride.find("div.ride-times")[0].text.split()
    departure_time = times[0]
    arrival_time = times[1]
    seats_str = ride.find("div.seats-notice")
    source = ride.find("div.departure-station-name")[0].text
    destination = ride.find("div.arrival-station-name")[0].text

    # BUGFIX: seats_available was left unbound (UnboundLocalError at the
    # return) when the notice existed but the regex did not match;
    # default to "unknown" instead.
    seats_available = None
    if seats_str and len(seats_str) > 0:
        seats_text = ride.find("div.seats-notice")[0].text
        # Raw string avoids the invalid-escape DeprecationWarning.
        matcher = re.match(r"(\d+)\s+\w+", seats_text)
        if matcher:
            seats_available = int(matcher.groups()[0])

    return {
        "departure_datetime": departure_time,
        "arrival_datetime": arrival_time,  # "2018-06-20 15:00:00",
        "source": source,
        "destinations": destination,
        "price": price,  # in EUR - you can use https://api.skypicker.com/rates
        "type": "bus",  # optional (bus/train)
        "source_id": 26323200,  # optional (carrier's id)
        "destination_id": 26383230,  # optional (carrier's id)
        "free_seats": seats_available,  # optional
        "carrier": "Flixbus",  # optional
    }
Example #17
0
def movements(process: requests_html.Element) -> List[Dict]:
    """Return the process movement table as a list of dicts.

    Each ``tr`` becomes ``{'data': <first cell>, 'movimento': <remaining
    cells joined>}``.
    """
    return [
        {'data': cells[0], 'movimento': ''.join(cells[1:])}
        for cells in (
            [col.text for col in row.xpath('//td')]
            for row in process.xpath('//tr')
        )
    ]
Example #18
0
 def prevs(self, num: int = 1):
     """Return the ``num``-th preceding sibling wrapped as a ``SessionElement``.

     requests_html's ``Element`` wraps an lxml element; the sibling is read
     from the underlying lxml object via XPath and re-wrapped. Returns
     ``None`` when no such sibling exists.

     :param num: how many siblings back to look (1 = immediately previous)
     """
     try:
         return SessionElement(
             Element(element=self.inner_ele.element.xpath(
                 f'./preceding-sibling::*[{num}]')[0],
                     url=self.inner_ele.url))
     except IndexError:
         return None
Example #19
0
 def shipping_weight_parse(element: Element) -> Optional[float]:
     """Return the shipping weight in pounds from the matching ``li``
     element; ``None`` when absent or expressed in another unit."""
     weight_item = element.find("li",
                                containing="Shipping Weight",
                                first=True)
     if not weight_item:
         return None
     weight, unit = parse_html_text_btw(
         weight_item.text, "\n?\n", "\n").split()
     if unit == "lbs":
         return float(weight)
Example #20
0
 def parents(self, num: int = 1):
     """Return the ``num``-th ancestor wrapped as a ``SessionElement``.

     requests_html's ``Element`` wraps an lxml element; the ancestor is read
     from the underlying lxml object via a chained ``'..'`` XPath and
     re-wrapped. Returns ``None`` when fewer than ``num`` ancestors exist.

     :param num: how many levels up to go (1 = direct parent)
     """
     try:
         return SessionElement(
             Element(element=self.inner_ele.element.xpath(
                 f'..{"/.." * (num - 1)}')[0],
                     url=self.inner_ele.url))
     except IndexError:
         return None
Example #21
0
    def _get_jobs(self, section: Element, page_no: str) -> List[Job]:
        """Returns job postings within a company section

        :param section: html content to process to extract jobs.
        :type section: Element
        :param page_no: the section part being processed
        :type page_no: str
        :return: list of jobs
        :rtype: List[Job]
        """
        jobs: List[Job] = []

        # First <span> carries the company name; the job rows live in
        # <li><a> entries under ul.jobs._list.
        company = section.find('span', first=True)
        content = section.find('ul.jobs._list', first=True)

        rows = [] if not content else content.find('li > a')
        for row in rows:
            title_parts = row.text.split('\n')

            # Stable content hash of the joined title lines, presumably
            # used downstream for de-duplication.
            text = '::'.join(title_parts)
            text_hash = hashlib.sha256(text.encode('utf-8'))

            jobs.append(
                Job(
                    **{
                        'page_no': page_no,
                        'hash': text_hash.hexdigest(),
                        'data': {
                            'company_name':
                            company.text if company else '',
                            'title':
                            ' | '.join(title_parts),
                            'href':
                            urljoin(self.url, row.attrs['href']),
                            # Second line of the link text, when present,
                            # is the job location.
                            'location':
                            None if len(title_parts) == 1 else title_parts[1],
                            'deadline':
                            None,
                        }
                    }))

        return jobs
Example #22
0
def _parse_tweet(tweet: Element) -> dict:
    """Convert one tweet html element into a flat dict of its ids,
    counters, author info and text."""
    tweet_div = tweet.find('div.tweet', first=True)
    timestamp_span = tweet.find('a.tweet-timestamp > span._timestamp',
                                first=True)

    # data-time-ms is epoch milliseconds; normalize to an aware UTC datetime.
    millis = int(timestamp_span.attrs['data-time-ms'])
    created_at = datetime.datetime.fromtimestamp(
        millis / 1000, tz=datetime.timezone.utc)

    attrs = tweet_div.attrs
    return {
        'id': int(attrs['data-tweet-id']),
        'conversation_id': int(attrs['data-conversation-id']),
        'created_at': created_at,
        'user_id': int(attrs['data-user-id']),
        'user_name': attrs['data-name'],
        'user_screen_name': attrs['data-screen-name'],
        'text': tweet_div.find('p.tweet-text', first=True).text,
        'replies_count': _tweet_stat(tweet_div, 'reply'),
        'retweets_count': _tweet_stat(tweet_div, 'retweet'),
        'favorites_count': _tweet_stat(tweet_div, 'favorite'),
        'mentions': attrs.get('data-mentions', '').split(),
    }
Example #23
0
def parts(process_parts: requests_html.Element) -> List[List[Dict]]:
    """Return the parties table as one list of ``{label: value}`` dicts
    per table row; each line is split on the first ``:``-style separator."""
    result = []
    for row in process_parts.xpath('//tr'):
        # Strip NBSPs and glue labels broken across lines back together.
        cleaned = row.text.replace('\xa0', '').replace(':\n', ':')
        row_data = []
        for entry in cleaned.split('\n'):
            pieces = entry.split(':')
            row_data.append({pieces[0]: pieces[1].strip()})
        result.append(row_data)
    return result
Example #24
0
def execute_session_find(
    page_or_ele: BaseParser,
    loc: Tuple[str, str],
    mode: str = 'single',
    show_errmsg: bool = False
) -> Union[SessionElement, List[SessionElement or str]]:
    """Perform an element lookup in session mode.

    Used both for finding elements from a page and for finding child
    elements from an element.

    :param page_or_ele: requests_html page or element object
    :param loc: locator tuple ``(by, value)``; ``by`` is 'xpath' or a
        css-selector marker
    :param mode: 'single' or 'all' — return the first match or every match
    :param show_errmsg: whether to print and re-raise on lookup errors
    :return: a SessionElement, a list of SessionElement/str, or None/[]
    """
    mode = mode or 'single'
    if mode not in ['single', 'all']:
        raise ValueError("Argument mode can only be 'single' or 'all'.")
    loc_by, loc_str = loc
    try:
        ele = None
        if loc_by == 'xpath':
            if 'PyQuery' in str(type(page_or_ele.element)):
                # Searching from the page object.
                ele = page_or_ele.xpath(loc_str)
            elif 'HtmlElement' in str(type(page_or_ele.element)):
                # Searching from an element; going through the raw lxml
                # element makes it possible to reach ancestor elements.
                try:
                    elements = page_or_ele.element.xpath(loc_str)
                    ele = [
                        Element(element=e, url=page_or_ele.url)
                        for e in elements
                    ]
                except AttributeError:
                    # XPath yielded plain strings (e.g. text()), which have
                    # no .element attribute — fall back to the wrapper.
                    ele = page_or_ele.xpath(loc_str)
        else:  # css selector lookup
            ele = page_or_ele.find(loc_str)

        if mode == 'single':
            ele = ele[0] if ele else None
            # Text results are HTML-unescaped and NBSPs become spaces.
            # NOTE(review): when nothing matched, unescape(None) raises and
            # the bare except below turns that into a None return.
            return SessionElement(ele) if isinstance(
                ele, Element) else unescape(ele).replace('\xa0', ' ')
        elif mode == 'all':
            ele = filter(lambda x: x != '\n', ele)  # drop bare newlines between elements
            ele = map(lambda x: unescape(x).replace('\xa0', ' ')
                      if isinstance(x, str) else x, ele)  # normalize text results
            return [
                SessionElement(e) if isinstance(e, Element) else e for e in ele
            ]
    except:
        if show_errmsg:
            print('Element(s) not found.', loc)
            raise
        return [] if mode == 'all' else None
    def from_html(cls, block: Element) -> Optional[Product]:
        """Build a Product from one search-result block, or return ``None``
        when no link in the block carries a SKU.

        The SKU comes from the first link exposing the App Insights
        tracking attribute; the price is the first money-looking token in
        the block's text, and the name is the first line without a "!".
        """
        for link in block.find("a"):
            try:
                sku = link.attrs["data-app-insights-track-search-doc-id"]
                break
            except KeyError:
                # BUGFIX: keep scanning the remaining links instead of
                # giving up on the first one lacking the attribute — the
                # for/else + break shape shows iteration was intended.
                continue
        else:
            # No link carried the tracking attribute (or there were none).
            return
        price = PRICE_RE.findall(block.text)
        name = [line for line in block.text.split("\n") if "!" not in line][0]

        return cls(sku, name, float(price[0]) if price else None)
Example #26
0
def general_data(process_general_data: requests_html.Element) -> Dict:
    """Collect the labelled header fields of a process page into a dict.

    For each known label, the first ``tr`` containing it is located and
    its text split into ``{label: value}`` on the ": " separator.
    """
    names = [
        'Classe', 'Área', 'Assunto', 'Distribuição', 'Juiz', 'Relator',
        'Valor da ação'
    ]
    result = {}
    for name in names:
        row = process_general_data.xpath(
            f"//tr[contains(string(), '{name}')]", first=True)
        if not row:
            continue
        pieces = row.text.replace(': ', ':\n').split(':\n')
        result[pieces[0]] = pieces[1]
    return result
Example #27
0
 def parse_poll_options(poll_element: Element):
     """Parse a poll element into ``{rank: option_dict}``.

     Each option contributes ``text``, ``vote_count`` and ``rank``. The
     single-element tuple unpacking (``x, = ...``) asserts that exactly one
     text / vote-count element exists per option row.
     """
     options_by_rank = {}
     for el_option in poll_element.find(OldExamSelectors.poll_option):
         # Exactly one text node is expected per option row.
         el_option_text, = el_option.find(OldExamSelectors.poll_option_text)
         option_text_raw = re.search(
             r'[A-ZÄÖ, ]+', el_option_text.text, re.IGNORECASE).group(0)
         option_rank, option_text = get_option_rank(option_text_raw)
         el_option_vote_count, = el_option.find(OldExamSelectors.option_vote_count)
         # Vote count: a number followed by "ä" (start of the Finnish
         # vote-count word) — presumably "ääntä"; TODO confirm markup.
         option_vote_count_match = re.search("([0-9]+) ä", el_option_vote_count.text)
         option_vote_count = int(option_vote_count_match.group(1))
         option = dict(
             text=option_text,
             vote_count=option_vote_count,
             rank=option_rank)
         options_by_rank[option_rank] = option
     return options_by_rank
Example #28
0
def yield_pron(
    request_html: requests_html.Element,
    ipa_xpath_selector: str,
    config: "Config",
) -> "Iterator[Pron]":
    """Yield cleaned pronunciations extracted from the given element.

    Candidates are matched with ``config.ipa_regex``, stripped of
    parentheses, filtered by ``_skip_pron`` and post-processed with
    ``config.process_pron``; falsy results are dropped.
    """
    for ipa_element in request_html.xpath(ipa_xpath_selector):
        match = re.search(config.ipa_regex, ipa_element.text)
        if match is None:
            continue
        # Parens mark optional segments; remove them entirely.
        candidate = match.group(1).replace("(", "").replace(")", "")
        if _skip_pron(candidate):
            continue
        processed = config.process_pron(candidate)
        if processed:
            yield processed
Example #29
0
    def get_pagination_details(self, page: Element) -> List[Dict[str, str]]:
        """Returns paging details within a html page.

        :param page: html page to process to extract paging details
        :type page: Element
        :return: list of paging details
        :rtype: List[Dict[str, str]]
        """
        paging = page.find('div.results-paging', first=True)
        spans = paging.find('.pagerLink') if paging else []

        # Each pager span exposes its page number as text and carries the
        # ASP.NET event target in its id ('_' separators become '$').
        return [{
            'page_no': span.text,
            'event_target': span.attrs.get('id').replace('_', '$')
        } for span in spans]
Example #30
0
def execute_session_find(
        page_or_ele: BaseParser,
        loc: tuple,
        mode: str = 'single',
        show_errmsg: bool = False
) -> Union[SessionElement, List[SessionElement]]:
    """Perform an element lookup in session mode.

    Used both for finding elements from a page and for finding child
    elements from an element.

    :param page_or_ele: requests_html page or element object
    :param loc: locator tuple ``(by, value)``
    :param mode: 'single' or 'all' — return the first match or every match
    :param show_errmsg: whether to print and re-raise on lookup errors
    :return: a SessionElement or a list of SessionElement (None/[] on failure)
    """
    mode = mode or 'single'
    if mode not in ['single', 'all']:
        raise ValueError("Argument mode can only be 'single' or 'all'.")
    loc_by, loc_str = loc
    try:
        ele = None
        if loc_by == 'xpath':
            if 'PyQuery' in str(type(page_or_ele.element)):  # searching from the page
                ele = page_or_ele.xpath(loc_str)
            elif 'HtmlElement' in str(type(page_or_ele.element)):  # searching from an element
                # Going through the raw lxml element makes it possible to
                # reach ancestor elements.
                elements = page_or_ele.element.xpath(loc_str)
                ele = [
                    Element(element=e, url=page_or_ele.url) for e in elements
                ]
        else:  # css selector lookup
            ele = page_or_ele.find(loc_str)

        if mode == 'single':
            return SessionElement(ele[0]) if ele else None
        elif mode == 'all':
            return [SessionElement(e) for e in ele]
    except:
        # NOTE(review): the bare except deliberately maps any lookup
        # failure to None/[]; it re-raises only when show_errmsg is set.
        if show_errmsg:
            print('Element(s) not found.', loc)
            raise
        return [] if mode == 'all' else None