コード例 #1
0
ファイル: site_handler_okey.py プロジェクト: celidos/ANE
    def process_single_url_product_page(self, pos):
        """Prefix ``pos['site_title']`` with the product page's processing type.

        Downloads the product page referenced by ``pos['site_link']``, looks
        for the ``col4 product-information`` block and its ``widget-list``
        list, and scans every ``<li>`` that holds exactly two ``<span>``
        elements (label, value).  When the label starts with
        ``'вид обработки:'`` ("type of processing:"), the value text is
        prepended to ``pos['site_title']``.

        Args:
            pos: Product dictionary; ``'site_link'`` is read and
                ``'site_title'`` is read and updated in place.

        Returns:
            Always an empty dict; the result is delivered by mutating ``pos``.
        """
        print('[okey] Product page ({})'.format(
            self.construct_full_link(pos['site_link'])))
        page_markup = self.get_html_custom_cookie(
            self.construct_full_link(pos['site_link']))
        document = BeautifulSoup(page_markup, 'lxml')

        info_block = document.find(
            'div', {'class': 'col4 product-information'})
        if not info_block:
            return {}

        property_list = info_block.find('ul', {'class': 'widget-list'})
        if not property_list:
            return {}

        for item in property_list.find_all('li'):
            spans = item.find_all('span')
            # Only "label / value" rows are of interest; anything else
            # (including rows without spans at all) is skipped.
            if len(spans) != 2:
                continue

            label, value = spans
            if not (label and value):
                continue

            # wspex_space presumably normalises whitespace -- confirm
            # against its definition.
            label_text = wspex_space(label.text).lower()
            if label_text.startswith('вид обработки:'):
                pos['site_title'] = wspex_space(
                    value.text) + ' ' + pos['site_title']

        return {}
コード例 #2
0
    def process_single_url_product_page(self, pos):
        """Enrich ``pos['site_title']`` with details from the product page.

        Downloads the page referenced by ``pos['site_link']`` and walks the
        rows of the ``xf-product-info__table xf-product-table`` table:

        * a header starting with ``'способ обработки'`` ("processing method")
          or ``'вид сахара'`` ("kind of sugar") has its value prepended to
          the title;
        * a header equal to ``'вес'`` ("weight") has its value appended to
          the title after ``' весом '``.

        Pages without the info table leave ``pos`` untouched instead of
        raising ``AttributeError``.

        Args:
            pos: Product dictionary; ``'site_link'`` is read and
                ``'site_title'`` is read and updated in place.

        Returns:
            Always an empty dict; the result is delivered by mutating ``pos``.
        """
        full_link = self.construct_full_link(pos['site_link'])
        print('Product page ({})'.format(full_link))
        soup = BeautifulSoup(get_html(full_link), 'lxml')

        general_info_table = soup.find(
            'table', {'class': 'xf-product-info__table xf-product-table'})
        if general_info_table is None:
            # find() returns None when nothing matches; there is nothing
            # to extract from such a page.
            return {}

        table_rows = general_info_table.find_all(
            'tr', {'class': 'xf-product-table__row'})
        for row in table_rows:
            th = row.find('th', {'class': 'xf-product-table__col-header'})
            td = row.find('td', {'class': 'xf-product-table__col'})
            if th is None or td is None:
                continue

            th_text = wspex_space(th.text).lower()
            if th_text.startswith(('способ обработки', 'вид сахара')):
                pos['site_title'] = wspex_space(
                    td.text) + ' ' + pos['site_title']
            elif th_text == 'вес':
                pos['site_title'] = pos[
                    'site_title'] + ' весом ' + wspex_space(td.text)

        return {}
コード例 #3
0
ファイル: site_handler_gks.py プロジェクト: celidos/ANE
    def extract_products(self, html):
        """Extract title/price pairs from the ``OutTbl`` table of a page.

        Each ``<tr>`` of the table contributes one entry when it has at
        least two ``<td>`` cells, its first cell is not marked with the
        ``TblShap`` class and the first cell's text is non-empty after
        ``wspex_space``.

        Args:
            html: Markup of the page to parse.

        Returns:
            A list of dicts with the keys ``'site_title'`` (raw text of the
            first cell) and ``'unitcost'`` (text of the second cell passed
            through ``tofloat``).  Empty when the table is missing or has no
            usable rows.
        """
        soup = BeautifulSoup(html, 'lxml')
        products_table = soup.find('table', {'class': 'OutTbl'})
        if products_table is None:
            return []

        res = []
        for price_elem in products_table.find_all('tr'):
            tds = price_elem.find_all('td')
            # Rows built from <th> cells (or otherwise short rows) carry no
            # title/price pair and would break the indexing below.
            if len(tds) < 2:
                continue

            # Beautiful Soup exposes the multi-valued ``class`` attribute as
            # a list, so it has to be tested by membership rather than by
            # comparison with a string.
            cell_classes = tds[0].get('class') or []
            if isinstance(cell_classes, str):
                cell_classes = cell_classes.split()
            if 'TblShap' in cell_classes:
                continue

            if not wspex_space(tds[0].text):
                continue

            res.append({
                'site_title': tds[0].text,
                'unitcost': tofloat(tds[1].text),
            })

        return res
コード例 #4
0
ファイル: site_handler_okey.py プロジェクト: celidos/ANE
    def extract_products(self, html, page=1):
        """Parse one product-listing page into a list of product dicts.

        Args:
            html: Markup of the listing page.
            page: Number of the page being parsed; compared with the largest
                page number found in the pagination control.

        Returns:
            A tuple ``(flag_nextpage, products)``.

            ``flag_nextpage`` is True when the pagination control shows a
            page number greater than ``page``.  ``(False, [])`` is returned
            when the listing container or its product blocks are missing.

            Each product dict holds ``'site_title'``, ``'site_link'``,
            ``'site_unit'`` and ``'unitcost'``, plus ``'site_cost'`` when the
            embedded product script could be parsed.  ``'unitcost'`` is None
            when the unit text starts with ``'Цена за'`` ("price per"), when
            no price was found, or when the unit text does not consist of an
            amount followed by a unit.  Products marked unavailable, and
            products with neither a weight nor a quantity block, are skipped.
        """
        soup = BeautifulSoup(html, 'lxml')

        products_div = soup.find('div', {'class': 'product_listing_container'})
        if products_div is None:
            return False, []

        flag_nextpage = False
        pages_controller_div = soup.find('div',
                                         {'class': 'pages pageControlMenu'})
        if pages_controller_div is not None:
            max_page_index = 1
            for ref in pages_controller_div.find_all('a',
                                                     {'class': 'hoverover'}):
                try:
                    page_index = int(ref.text.strip())
                except ValueError:
                    # Non-numeric pagination links carry no page number.
                    continue
                max_page_index = max(max_page_index, page_index)
            flag_nextpage = max_page_index > page

        price_list = products_div.find_all('div',
                                           {'class': 'product ok-theme'})
        if not price_list:
            return False, []

        pproc = PostProcessor()
        res = []

        for price_elem in price_list:
            product_unavailable_div = price_elem.find(
                'div', {'class': 'product-unavailable-text'})
            if product_unavailable_div is not None:
                continue  # just skip

            price_dict = {'site_title': '', 'site_link': ''}

            product_name_div = price_elem.find('div',
                                               {'class': 'product_name'})
            if product_name_div is not None:
                # The first link of the whole product block is used.
                aref = price_elem.find('a')
                if aref is not None:
                    price_dict['site_title'] = aref.get('title')
                    price_dict['site_link'] = aref.get('href')

            product_price_script = price_elem.find('script',
                                                   {'id': 'productData_'})
            if product_price_script is not None:
                sr = re.search(r'var\s+product\s*=\s*(?P<dct>.+\});\s*$\s*',
                               product_price_script.text, re.MULTILINE)
                if sr is not None:
                    # yaml and json fail on this literal, hence demjson.
                    dct = demjson.decode(sr.group('dct'))
                    price_dict['site_cost'] = dct['price']

            weight_div = price_elem.find('div', {'class': 'product_weight'})
            if weight_div:
                price_dict['site_unit'] = wspex_space(weight_div.text)
            else:
                quantity_div = price_elem.find('div',
                                               {'class': 'quantity_section'})
                if quantity_div:
                    price_dict['site_unit'] = '1 уп.'
                else:
                    print('[okey] For product', price_dict['site_title'],
                          ' weight not found!')
                    continue

            price_dict['unitcost'] = None
            if not price_dict['site_unit'].startswith('Цена за'):
                sunt = price_dict['site_unit'].split()
                site_cost = price_dict.get('site_cost')
                # A unit cost needs both a price and an "<amount> <unit>"
                # pair; otherwise it stays None instead of raising.
                if site_cost is not None and len(sunt) >= 2:
                    amount, unit = tofloat(sunt[0]), sunt[1]
                    price_dict['unitcost'] = (
                        site_cost * pproc.get_coeff_by_amount_and_unit(
                            amount, unit))

            res.append(price_dict)

        return flag_nextpage, res
コード例 #5
0
    def extract_products(self, html, page=1):
        """Parse one product-listing page into a list of product dicts.

        Args:
            html: Markup of the listing page.
            page: Number of the page being parsed; compared with the largest
                page number found in the ``el_paginate`` control.

        Returns:
            A tuple ``(flag_nextpage, products)``.

            ``flag_nextpage`` is True when the pagination control shows a
            page number greater than ``page``.  ``(False, [])`` is returned
            when the ``goods_view_box`` container or its item blocks are
            missing.

            Each product dict always holds ``'site_title'`` and
            ``'site_link'`` (empty strings when no caption link exists).
            ``'site_cost'`` and ``'site_unit'`` are present only when the
            current-price block was found.
        """
        soup = BeautifulSoup(html, 'lxml')

        products_div = soup.find('div', {'class': 'goods_view_box'})
        if products_div is None:
            return False, []

        flag_nextpage = False
        pages_controller_div = soup.find('div', {'class': 'el_paginate'})
        if pages_controller_div is not None:
            max_page_index = 1
            for ref in pages_controller_div.find_all('a',
                                                     {'class': 'hoverover'}):
                # representsInt is expected to return None for link texts
                # that are not integers -- confirm against its definition.
                page_index = self.representsInt(ref.text.strip())
                if page_index is not None and page_index > max_page_index:
                    max_page_index = page_index
            flag_nextpage = max_page_index > page

        price_list = products_div.find_all(
            'div', {'class': 'goods_view_box-view goods_view goods_view-item'})
        if not price_list:
            return False, []

        res = []
        for price_elem in price_list:
            price_dict = {'site_title': '', 'site_link': ''}

            product_name_div = price_elem.find(
                'div', {'class': 'goods_view_box-caption'})
            aref = None
            if product_name_div is not None:
                aref = product_name_div.find('a')
            if aref is not None:
                price_dict['site_title'] = wspex_space(aref.text)
                price_dict['site_link'] = aref.get('href')

            product_price_div = price_elem.find(
                'div', {'class': 'goods_price-item current'})
            if product_price_div is not None:
                price_dict['site_cost'] = find_float_number(
                    product_price_div.text)
                # The first character of data-weight is dropped (presumably
                # a separator -- confirm against real markup).  A missing
                # attribute yields '' rather than 'one' from str(None)[1:].
                raw_weight = product_price_div.get('data-weight')
                if raw_weight is None:
                    price_dict['site_unit'] = ''
                else:
                    price_dict['site_unit'] = str(raw_weight)[1:]

            res.append(price_dict)

        return flag_nextpage, res