Beispiel #1
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Fetch a single product page and parse its title and price.

        Returns None when the page cannot be loaded, otherwise a
        ParsedProduct dict with 'url', 'title' and 'price_new' filled in.
        """
        page_source = self._load_page_with_TL(url)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        product = get_empty_parsed_product_dict()
        product['url'] = url

        # Title is the main <h1> plus the black sub-header, space-joined.
        main_part = remove_odd_space(
            soup.find('h1', class_='product-cart__title').text)
        sub_part = remove_odd_space(
            soup.find(
                'a', class_='product-cart__content-info-header-black').text)
        product['title'] = main_part + ' ' + sub_part

        # Price: strip all spaces, then drop the trailing currency char.
        raw_price = soup.find(
            'div', class_='product-cart__content-price-actual').text
        product['price_new'] = remove_ALL_spaces(raw_price)[:-1]

        return product
Beispiel #2
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Load a product page (10 s timeout) and extract title and price.

        Returns None when the page cannot be loaded.
        """
        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        product = get_empty_parsed_product_dict()
        product['url'] = url

        # title comes from the offer header
        product['title'] = remove_odd_space(
            soup.find('h1', class_='b-offer-title').text)

        # price: keep digits only
        product['price_new'] = remove_non_digits(
            soup.find('div', class_='b-offer-box__price').text)

        return product
Beispiel #3
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Parse an IKEA product page: title (+ description) and price.

        Returns None when the page cannot be loaded.
        """
        page_source = self._load_page_with_TL(url)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}")
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        parsed_product = get_empty_parsed_product_dict()
        parsed_product['url'] = url

        # title: big header plus the short description text
        # (was a duplicated "# title" comment; class_ now passed explicitly)
        title = remove_odd_space(soup.find('div', class_='range-revamp-header-section__title--big').text)
        sub_title = remove_odd_space(soup.find('span', class_='range-revamp-header-section__description-text').text)
        title += ' ' + sub_title
        parsed_product['title'] = title

        # price: integer part only, spaces stripped
        price = remove_ALL_spaces(soup.find('span', class_='range-revamp-price__integer').text)
        parsed_product['price_new'] = price

        return parsed_product
Beispiel #4
0
    def _get_parsed_product_from_search(
            self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search svyaznoy.ru for an appliance category and parse results.

        Returns None for non-appliance categories or when the page cannot
        be loaded; otherwise a list of ParsedProduct dicts. Items that
        fail to parse are logged and skipped.
        """
        if category_row['sub_type'] != 'appliances':
            return None

        parsed_product_list = []

        url = self.make_search_url(category_row['search_word'])

        print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
        print(f'using url:\n{url}')

        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        for parsed_item in soup.find_all('div', class_='b-product-block'):
            try:
                parsed_product = get_empty_parsed_product_dict()

                # title
                title = remove_odd_space(
                    parsed_item.find('div',
                                     class_='b-product-block__name').text)
                # the type block is optional; prepend it when present
                # (find() returns None -> .text raises AttributeError)
                try:
                    sub_title = remove_odd_space(
                        parsed_item.find('div',
                                         class_='b-product-block__type').text)
                    title = sub_title + ' ' + title
                except AttributeError:
                    pass

                parsed_product['title'] = title

                # url (relative -> absolute)
                url = parsed_item.find(
                    'a', class_='b-product-block__main-link')['href']
                url = fr'https://www.svyaznoy.ru{url}'
                parsed_product['url'] = url

                # price
                price = remove_non_digits(
                    parsed_item.find(
                        'span', class_='b-product-block__visible-price').text)
                parsed_product['price_new'] = price

                parsed_product_list.append(parsed_product)
            except Exception:
                # FIXME log fatal
                # was a bare except: that also swallowed KeyboardInterrupt
                print("can't parse svaznoy item")
                print(parsed_item)

        return parsed_product_list
Beispiel #5
0
    def _get_parsed_product_from_search(
            self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search eldorado.ru for an appliance category and parse results.

        Returns None for non-appliance categories or when a page cannot
        be loaded; otherwise a list of ParsedProduct dicts.
        """
        if category_row['sub_type'] != 'appliances':
            return None

        full_parsed_product_list = []

        for page_num in range(1):
            parsed_product_list = []
            # url = self.get_search_url_for_category(category_row, page_num)
            url = self.create_general_search_url(category_row['search_word'],
                                                 page_num)

            print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
            print(f'using url:\n{url}')

            page_source = self._load_page_with_TL(url)
            if page_source is None:
                # fixme - log - fatal - can't load page
                print(
                    f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
                )
                return None

            soup = BeautifulSoup(page_source, 'html.parser')

            for parsed_item in soup.find_all('li', {'data-dy': 'product'}):
                parsed_product = get_empty_parsed_product_dict()
                # title
                title = remove_odd_space(
                    parsed_item.find('a', {
                        'data-dy': 'title'
                    }).text)
                parsed_product['title'] = title

                # url (relative -> absolute)
                url = remove_odd_space(
                    parsed_item.find('a', {'data-dy': 'title'})['href'])
                url = f"https://www.eldorado.ru{url}"
                parsed_product['url'] = url

                # price: collect every span that looks like a rouble amount
                # in a plausible range and keep the largest one
                price_list = []
                for price_item in parsed_item.find_all('span'):
                    if hasattr(price_item,
                               'text') and price_item.text[-2:] == 'р.':
                        mb_price = float(remove_non_digits(price_item.text))
                        if 100 <= mb_price <= 100000:
                            price_list.append(mb_price)
                if not price_list:
                    # no plausible price found: skip the item instead of
                    # crashing with IndexError (was sorted(price_list)[-1])
                    continue
                parsed_product['price_new'] = max(price_list)

                parsed_product_list.append(parsed_product)
            full_parsed_product_list.extend(parsed_product_list)
        return full_parsed_product_list
Beispiel #6
0
    def _get_parsed_product_from_search(self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search lenta.com for a food category and parse product cards.

        Returns None for non-food categories, an empty list when the page
        cannot be loaded, and raises ValueError when the shop reports the
        item as unavailable. Unparsable items are dumped to a file.
        """
        if category_row['type'] != 'food':
            return None

        parsed_product_list = []

        url = self._create_serch_url_for_category(category_row)

        print(f"{self.get_handler_name()} -> {category_row['cat_title']}")

        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            # NOTE(review): returns [] here while sibling handlers return
            # None on load failure — confirm callers before unifying
            print(f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}")
            return []

        if 'товар не представлен' in str(page_source):
            # FIXME error log
            print(f"no searched item in {self.get_handler_name()}, {category_row['cat_title']}")
            raise ValueError('no searched item in shop')

        soup = BeautifulSoup(page_source, 'html.parser')

        for page_item in soup.find_all('div', class_='sku-card-small-container'):
            parsed_product = get_empty_parsed_product_dict()

            try:
                # title
                title = page_item.find('div', class_='sku-card-small__title').text
                # sub title is optional (find() -> None raises AttributeError)
                try:
                    sub_title = page_item.find('div', class_='sku-card-small__sub-title').text
                    title += ' ' + sub_title
                except AttributeError:
                    pass
                parsed_product['title'] = title

                # url (relative -> absolute)
                url = page_item.find('a', class_='sku-card-small')['href']
                parsed_product['url'] = fr"https://lenta.com{url}"

                # price
                price_new = page_item.find('span', class_='sku-price__integer').text
                parsed_product['price_new'] = price_new

                parsed_product['price_old'] = None

                parsed_product_list.append(parsed_product)
            except Exception:
                # was a bare except:; dump the broken item for inspection
                print('ERROR! in parsing page_item')
                with open(f"_{self.get_handler_name()}_{category_row['cat_title']}_{time.time()}.page_item", 'w+') as file:
                    file.write(str(page_item))

        return parsed_product_list
    def _get_parsed_product_from_search(
            self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search perekrestok.ru for a food category and parse the catalog.

        Skips items marked as temporarily unavailable. Returns None for
        non-food categories or when the page cannot be loaded.
        """
        if category_row['type'] != 'food':
            return None

        parsed_product_list = []

        url = self._create_serch_url_for_category(category_row['search_word'])

        print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
        print(f'using url:\n{url}')

        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        for parsed_item in soup.find_all('li', class_='xf-catalog__item'):

            # "Временно отсутствует" == temporarily out of stock
            if "Временно отсутствует" in str(parsed_item):
                continue

            parsed_product = get_empty_parsed_product_dict()

            # title
            title = remove_odd_space(
                parsed_item.find('a', class_='xf-product-title__link').text)
            parsed_product['title'] = title

            # url (relative -> absolute)
            url = parsed_item.find('a',
                                   class_='xf-product-title__link')['href']
            url = f"https://perekrestok.ru{url}"
            parsed_product['url'] = url

            # price: prefer the pre-discount price, fall back to current.
            # find() -> None makes the subscription raise TypeError; a tag
            # missing 'data-cost' raises KeyError (was a bare except:)
            try:
                price = parsed_item.find(
                    'div', class_='xf-product-cost__old-price')['data-cost']
            except (TypeError, KeyError):
                price = parsed_item.find(
                    'div', class_='xf-product-cost__current')['data-cost']
            parsed_product['price_new'] = price

            parsed_product_list.append(parsed_product)

        return parsed_product_list
Beispiel #8
0
    def _get_parsed_product_from_search(
            self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search rigla for a medicine category and parse the results.

        Returns None for non-medicine categories or when the page cannot
        be loaded; items that fail to parse are logged and skipped.
        """
        if category_row['sub_type'] != 'medicine':
            return None

        parsed_product_list = []

        url = self._create_search_url_for_category(category_row['search_word'])

        print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
        print(f'using url:\n{url}')

        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        for parsed_item in soup.find_all('div', class_='product'):
            try:
                parsed_product = get_empty_parsed_product_dict()
                # title: product name plus brand link
                title = remove_odd_space(
                    parsed_item.find('a', class_='product__title').text)
                sub_title = remove_odd_space(
                    parsed_item.find('a', class_='product-brand__link').text)
                title += ' ' + sub_title
                parsed_product['title'] = title

                # url
                url = parsed_item.find('a', class_='product__title')['href']
                parsed_product['url'] = self._create_link_to_product(url)

                # price
                price = remove_ALL_spaces(
                    parsed_item.find(
                        'span', class_='product__active-price-number').text)
                parsed_product['price_new'] = price

                parsed_product_list.append(parsed_product)
            except Exception:
                # FIXME log fatal
                # was a bare except: that also swallowed KeyboardInterrupt
                print("can't parse rigla item")
                print(parsed_item)

        return parsed_product_list
Beispiel #9
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Parse a product page: title, new/old price and unit attributes.

        Returns None when the page cannot be loaded. 'price_old' is None
        when the item has no discount; unit fields are best-effort.
        """
        page_source = self._load_page_with_TL(url)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        parsed_product = get_empty_parsed_product_dict()
        parsed_product['url'] = url

        # title
        parsed_product['title'] = soup.find('h1', class_='main_header').text

        # price
        price_item = soup.find('span', class_='product-price')
        price_new = find_float_number(
            price_item.find('span', class_='price').text)
        parsed_product['price_new'] = price_new

        # old (crossed-out) price is present only for discounted items;
        # find() -> None makes .text raise AttributeError (was bare except:)
        try:
            price_old = find_float_number(
                price_item.find('span', class_='crossed').text)
            parsed_product['price_old'] = price_old
        except AttributeError:
            # no discount on this item
            parsed_product['price_old'] = None

        # units (best-effort: attribute list layout may vary)
        try:
            unit_item = soup.find('ul', class_='widget-list').find_all(
                'li', class_='attributes__item')[1]
            unit_title = wspex(
                unit_item.find('div', class_='attributes__name').text)
            parsed_product['unit_title'] = unit_title

            unit_value = find_float_number(
                unit_item.find('div', class_='attributes__value').text)
            parsed_product['unit_value'] = unit_value

            parsed_product['unparsed_units'] = unit_value + " " + unit_title
        except (AttributeError, IndexError):
            pass

        return parsed_product
Beispiel #10
0
    def _get_parsed_product_from_search(self, category_row) -> Union[None, List[ParsedProduct]]:
        """Search IKEA for a furniture category and parse the result grid.

        Returns None for non-furniture categories or when the page cannot
        be loaded; items that fail to parse are logged and skipped.
        """
        if category_row['sub_type'] != 'furniture':
            return None

        parsed_product_list = []

        url = self._create_search_url_for_category(category_row['search_word'])

        print(f"{self.get_handler_name()} -> {category_row['cat_title']}")
        print(f'using url:\n{url}')

        page_source = self._load_page_with_TL(url)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}")
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        for parsed_item in soup.find_all('div', class_='serp-grid__item'):
            try:
                parsed_product = get_empty_parsed_product_dict()

                # title: small header plus description (class_ now explicit)
                title = remove_odd_space(parsed_item.find('div', class_='range-revamp-header-section__title--small').text)
                sub_title = remove_odd_space(parsed_item.find('span', class_='range-revamp-header-section__description-text').text)
                title += ' ' + sub_title
                parsed_product['title'] = title

                # url: first anchor of the grid cell is already absolute
                url = parsed_item.find('a')['href']
                parsed_product['url'] = url

                # price: integer part, spaces stripped
                price = remove_ALL_spaces(parsed_item.find('span', class_='range-revamp-price__integer').text)
                parsed_product['price_new'] = price

                parsed_product_list.append(parsed_product)
            except Exception:
                # FIXME log fatal
                # was a bare except: that also swallowed KeyboardInterrupt
                print("can't parse IKEA item")
                print(parsed_item)

        return parsed_product_list
Beispiel #11
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Parse a lenta.com product page: title, regular price, packaging.

        Returns None when the page cannot be loaded; raises ValueError
        when the shop reports the item as unavailable.
        """
        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}")
            return None

        if 'В выбранном Вами магазине данный товар не представлен' in str(page_source):
            # FIXME error log
            print('no searched item in shop')
            raise ValueError('no searched item in shop')

        soup = BeautifulSoup(page_source, 'html.parser')

        parsed_product = get_empty_parsed_product_dict()
        parsed_product['url'] = url

        # title (sub-title is optional; find() -> None raises AttributeError)
        title = remove_odd_space(str(soup.find('h1', class_='sku-page__title').text))
        try:
            sub_title = remove_odd_space(soup.find('div', class_='sku-page__sub-title').text)
            title += ' ' + sub_title
        except AttributeError:
            pass
        parsed_product['title'] = title

        # price: take the block labelled "обычная" (regular price)
        for item in soup.find_all('div', class_='sku-prices-block__item'):
            if 'обычная' in str(item).lower():
                price = remove_odd_space(item.find('span', class_='sku-price__integer').text).replace(' ', '')
                parsed_product['price_new'] = float(price.replace(',', '.'))

        # unit: packaging row ("Упаковка") from the parameters tab
        for item in soup.find_all('div', class_='sku-card-tab-params__item'):
            if 'Упаковка' in str(item):
                unit = remove_odd_space(item.find('dd', 'sku-card-tab-params__value').text)
                parsed_product['unparsed_units'] = unit

        return parsed_product
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Parse a product page: title, current price and optional old price.

        Returns None when the page cannot be loaded; 'price_old' is None
        when the item is not discounted.
        """
        page_source = self._load_page_with_TL(url, 10.0)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        parsed_product = get_empty_parsed_product_dict()
        parsed_product['url'] = url

        # title: the page uses one of two header layouts; a missing header
        # makes find() return None and .text raise AttributeError
        try:
            title = remove_odd_space(
                soup.find('h1', class_='xf-product-card__title').text)
        except AttributeError:
            title = remove_odd_space(
                soup.find('h1', class_='xf-product-new__title').text)
        # (removed a redundant second remove_odd_space pass over title)
        parsed_product['title'] = title

        # price
        price_new = soup.find('span', class_='js-price-rouble').text
        price_new = remove_odd_space(price_new)
        parsed_product['price_new'] = price_new

        # old price is present only for discounted items
        try:
            price_old = soup.find('span', class_='js-old-price-rouble').text
            price_old = remove_odd_space(price_old)
        except AttributeError:
            price_old = None
        parsed_product['price_old'] = price_old

        return parsed_product
Beispiel #13
0
    def _get_parsed_product_from_url(self, url) -> Union[None, ParsedProduct]:
        """Parse a product page: title and price (old price if present).

        Returns None when the page cannot be loaded. Unit fields are
        hard-coded to a single piece.
        """
        page_source = self._load_page_with_TL(url)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        parsed_product = get_empty_parsed_product_dict()
        parsed_product['url'] = url
        # title
        title = remove_odd_space(
            soup.find('h1', class_='catalogItemDetailHd',
                      itemprop='name').text)
        parsed_product['title'] = title

        # price: prefer the old-price element, fall back to the active one;
        # find() -> None makes .text raise AttributeError (was bare except:)
        try:
            price = remove_ALL_spaces(
                soup.find('span',
                          class_='product-box-price__old-el').text)[:-2]
        except AttributeError:
            price = remove_ALL_spaces(
                soup.find('div', class_='product-box-price__active').text)[:-2]
        parsed_product['price_new'] = price
        parsed_product['price_old'] = None

        # float, value in unit of unit_title
        parsed_product['unit_value'] = 1
        # string, name of units
        parsed_product['unit_title'] = '1шт'

        return parsed_product
Beispiel #14
0
    def _get_parsed_product_from_search(
            self, categoty_row) -> Union[None, List[ParsedProduct]]:
        """Search okeydostavka.ru for a food category and parse listings.

        Returns None for non-food categories or when the page cannot be
        loaded; items that fail to parse are logged and skipped.
        (Parameter name keeps the existing 'categoty_row' typo to stay
        backward-compatible with keyword callers.)
        """
        if categoty_row['type'] != 'food':
            return None

        parsed_product_list = []

        url = self._create_serch_url_for_category(
            str(categoty_row['search_word']).replace(' ', '+'))

        print(f"{self.get_handler_name()} -> {categoty_row['cat_title']}")
        print(f'using url:\n{url}')

        page_source = self._load_page_with_TL(url)
        if page_source is None:
            # fixme - log - fatal - can't load page
            print(
                f"can't load page, info:\n, handler : {self.get_handler_name()}\nurl: {url}"
            )
            return None

        soup = BeautifulSoup(page_source, 'html.parser')

        for product_list in soup.find_all('div',
                                          class_='product_listing_container'):
            for product_item in product_list.find_all('div', class_='product'):
                try:
                    parsed_product = get_empty_parsed_product_dict()

                    # title
                    parsed_product['title'] = product_item.find('a')['title']

                    # url (relative -> absolute)
                    parsed_product[
                        'url'] = rf"https://www.okeydostavka.ru{product_item.find('a')['href']}"

                    # weight block is optional; find() -> None raises
                    # AttributeError on .find/.text (was a bare except:)
                    try:
                        parsed_product['unit_value'] = find_float_number(
                            product_item.find('div',
                                              class_='product-weight').text)
                        parsed_product['unit_title'] = wspex(
                            product_item.find(
                                'div',
                                class_='product-weight').find('span').text)
                    except AttributeError:
                        parsed_product['unit_value'] = None
                        parsed_product['unit_title'] = None

                    # price, also work for reduced price
                    product_item_price = product_item.find(
                        'div', class_='product-price')
                    parsed_product['price_new'] = find_float_number(
                        product_item_price.find('span', class_='price').text)
                    # price, old (not reduced) — only for discounted items
                    try:
                        old_price = wspex(
                            product_item_price.find('span',
                                                    class_='crossed').text)
                        parsed_product['price_old'] = find_float_number(
                            old_price)
                    except AttributeError:
                        # just no discount
                        parsed_product['price_old'] = None

                    parsed_product_list.append(parsed_product)
                except Exception:
                    # was a bare except: that also swallowed KeyboardInterrupt
                    print('\nOkey parser, cant parse:')
                    print(product_item, end='\n\n')

        return parsed_product_list