def process_single_url_product_page(self, pos):
    """Prefix ``pos['site_title']`` with the processing type from the product page.

    The page addressed by ``pos['site_link']`` is fetched through
    ``self.get_html_custom_cookie`` and parsed. Inside the
    ``div.col4.product-information`` block, every ``<li>`` of the
    ``ul.widget-list`` that holds exactly two ``<span>`` elements is treated
    as a (label, value) pair. When the normalised, lower-cased label starts
    with 'вид обработки:', the value is prepended to ``pos['site_title']``.

    Args:
        pos: Mapping read for 'site_link' and 'site_title'; 'site_title' is
            rewritten in place when a matching row is found.

    Returns:
        Always an empty dict; the outcome is carried by ``pos``.
    """
    print('[okey] Product page ({})'.format(
        self.construct_full_link(pos['site_link'])))
    page_html = self.get_html_custom_cookie(
        self.construct_full_link(pos['site_link']))

    info_block = BeautifulSoup(page_html, 'lxml').find(
        'div', {'class': 'col4 product-information'})
    if not info_block:
        return {}

    attr_list = info_block.find('ul', {'class': 'widget-list'})
    if not attr_list:
        return {}

    for item in attr_list.find_all('li'):
        spans = item.find_all('span')
        # Only rows made of exactly a label span and a value span are usable.
        if len(spans) != 2:
            continue
        label, value = spans
        if not (label and value):
            continue
        label_text = wspex_space(label.text).lower()
        if label_text.startswith('вид обработки:'):
            pos['site_title'] = wspex_space(value.text) + ' ' + pos['site_title']
    return {}
def process_single_url_product_page(self, pos):
    """Enrich ``pos['site_title']`` with attributes from the product page.

    The page addressed by ``pos['site_link']`` is downloaded with
    ``get_html`` and parsed. Each ``tr.xf-product-table__row`` of the
    product-information table is inspected by its header cell:

    * header starting with 'способ обработки' or 'вид сахара' -- the cell
      value is prepended to the title;
    * header equal to 'вес' -- the cell value is appended after ' весом '.

    Args:
        pos: Mapping read for 'site_link' and 'site_title'; 'site_title' is
            rewritten in place for every matching row.

    Returns:
        Always an empty dict; the outcome is carried by ``pos``. An empty
        dict is also returned, without touching ``pos``, when the page has
        no product-information table.
    """
    page_url = self.construct_full_link(pos['site_link'])
    print('Product page ({})'.format(page_url))
    soup = BeautifulSoup(get_html(page_url), 'lxml')

    general_info_table = soup.find(
        'table', {'class': 'xf-product-info__table xf-product-table'})
    # find() returns None when the table is absent; bail out instead of
    # failing with AttributeError on the find_all() below.
    if general_info_table is None:
        return {}

    rows = general_info_table.find_all(
        'tr', {'class': 'xf-product-table__row'})
    for row in rows:
        th = row.find('th', {'class': 'xf-product-table__col-header'})
        td = row.find('td', {'class': 'xf-product-table__col'})
        if th is None or td is None:
            continue
        th_text = wspex_space(th.text).lower()
        if th_text.startswith(('способ обработки', 'вид сахара')):
            pos['site_title'] = wspex_space(td.text) + ' ' + pos['site_title']
        elif th_text == 'вес':
            pos['site_title'] = pos['site_title'] + ' весом ' + wspex_space(td.text)
    return {}
def extract_products(self, html):
    """Extract title/price pairs from the ``table.OutTbl`` price table.

    Every ``<tr>`` with at least two ``<td>`` cells is considered. A row is
    skipped when its first cell carries the CSS class 'TblShap' or when the
    first cell's text is empty after ``wspex_space`` normalisation.

    Args:
        html: Markup of the price-list page.

    Returns:
        A list of dicts with keys 'site_title' (raw text of the first cell)
        and 'unitcost' (``tofloat`` of the second cell's text). The list is
        empty when the table is missing or has no usable rows.
    """
    soup = BeautifulSoup(html, 'lxml')
    products_table = soup.find('table', {'class': 'OutTbl'})
    if products_table is None:
        return []

    res = []
    for row in products_table.find_all('tr'):
        cells = row.find_all('td')
        # Rows without both a title cell and a price cell cannot be parsed.
        if len(cells) < 2:
            continue
        title_cell, price_cell = cells[0], cells[1]

        # BeautifulSoup exposes the multi-valued ``class`` attribute as a
        # list, so comparing it to a plain string never matches; test
        # membership instead.
        cell_classes = title_cell.get('class') or []
        if isinstance(cell_classes, str):
            cell_classes = cell_classes.split()
        if 'TblShap' in cell_classes:
            continue

        if not wspex_space(title_cell.text):
            continue

        res.append({
            'site_title': title_cell.text,
            'unitcost': tofloat(price_cell.text),
        })
    return res
def extract_products(self, html, page=1):
    """Parse one product-listing page into product dicts.

    Args:
        html: Markup of the listing page.
        page: Number of the page being parsed; compared with the largest
            numeric link found in the pagination block.

    Returns:
        A tuple ``(flag_nextpage, products)``. ``flag_nextpage`` is True
        when the pagination block contains a page number greater than
        ``page``. ``products`` is a list of dicts with keys 'site_title',
        'site_link', 'site_unit', 'unitcost' and, when the embedded product
        script could be parsed, 'site_cost'. ``(False, [])`` is returned
        when the listing container or its product tiles are missing.

    Products marked as unavailable, and products for which neither a
    weight block nor a quantity block is found, are skipped.
    """
    soup = BeautifulSoup(html, 'lxml')
    products_div = soup.find('div', {'class': 'product_listing_container'})
    if products_div is None:
        return False, []

    pages_controller_div = soup.find('div', {'class': 'pages pageControlMenu'})
    if pages_controller_div is None:
        flag_nextpage = False
    else:
        max_page_index = 1
        for ref in pages_controller_div.find_all('a', {'class': 'hoverover'}):
            try:
                page_index = int(ref.text.strip())
            except ValueError:
                # Non-numeric pagination links carry no page number.
                continue
            if page_index > max_page_index:
                max_page_index = page_index
        flag_nextpage = max_page_index > page

    price_list = products_div.find_all('div', {'class': 'product ok-theme'})
    if not price_list:
        return False, []

    res = []
    pproc = PostProcessor()
    for price_elem in price_list:
        unavailable_div = price_elem.find(
            'div', {'class': 'product-unavailable-text'})
        if unavailable_div is not None:
            continue  # unavailable products are skipped

        price_dict = {'site_title': '', 'site_link': ''}
        if price_elem.find('div', {'class': 'product_name'}) is not None:
            aref = price_elem.find('a')
            if aref is not None:
                price_dict['site_title'] = aref.get('title')
                price_dict['site_link'] = aref.get('href')

        product_price_script = price_elem.find('script', {'id': 'productData_'})
        if product_price_script is not None:
            sr = re.search(
                r'var\s+product\s*=\s*(?P<dct>.+\});\s*$\s*',
                product_price_script.text,
                re.MULTILINE,
            )
            if sr is not None:
                # demjson is used because yaml and json fail on this literal.
                dct = demjson.decode(sr.group('dct'))
                price_dict['site_cost'] = dct['price']

        weight_div = price_elem.find('div', {'class': 'product_weight'})
        if weight_div:
            price_dict['site_unit'] = wspex_space(weight_div.text)
        elif price_elem.find('div', {'class': 'quantity_section'}):
            price_dict['site_unit'] = '1 уп.'
        else:
            print('[okey] For product', price_dict['site_title'],
                  ' weight not found!')
            continue

        unit_tokens = price_dict['site_unit'].split()
        if price_dict['site_unit'].startswith('Цена за'):
            price_dict['unitcost'] = None
        elif 'site_cost' not in price_dict or len(unit_tokens) < 2:
            # Without a parsed price or an "<amount> <unit>" pair the unit
            # cost cannot be derived; record it as unknown instead of
            # raising KeyError/IndexError and losing the whole page.
            price_dict['unitcost'] = None
        else:
            amount, unit = tofloat(unit_tokens[0]), unit_tokens[1]
            price_dict['unitcost'] = (
                price_dict['site_cost']
                * pproc.get_coeff_by_amount_and_unit(amount, unit)
            )
        res.append(price_dict)
    return flag_nextpage, res
def extract_products(self, html, page=1):
    """Parse one ``goods_view_box`` listing page into product dicts.

    Args:
        html: Markup of the listing page.
        page: Number of the page being parsed; compared with the largest
            numeric link found in the ``el_paginate`` block.

    Returns:
        A tuple ``(flag_nextpage, products)``. ``flag_nextpage`` is True
        when the pagination block contains a page number greater than
        ``page``. Each product dict always has 'site_title' and
        'site_link' (empty strings when the caption link is missing) and,
        when a current-price block exists, 'site_cost' and 'site_unit'.
        ``(False, [])`` is returned when the listing container or its
        product tiles are missing.
    """
    soup = BeautifulSoup(html, 'lxml')
    products_div = soup.find('div', {'class': 'goods_view_box'})
    if products_div is None:
        return False, []

    pages_controller_div = soup.find('div', {'class': 'el_paginate'})
    if pages_controller_div is None:
        flag_nextpage = False
    else:
        max_page_index = 1
        for ref in pages_controller_div.find_all('a', {'class': 'hoverover'}):
            # NOTE(review): representsInt is used here as "int or None";
            # confirm against its definition.
            page_index = self.representsInt(ref.text.strip())
            if page_index is not None and page_index > max_page_index:
                max_page_index = page_index
        flag_nextpage = max_page_index > page

    price_list = products_div.find_all(
        'div', {'class': 'goods_view_box-view goods_view goods_view-item'})
    if not price_list:
        return False, []

    res = []
    for price_elem in price_list:
        price_dict = {'site_title': '', 'site_link': ''}

        product_name_div = price_elem.find(
            'div', {'class': 'goods_view_box-caption'})
        if product_name_div is not None:
            aref = product_name_div.find('a')
            if aref is not None:
                price_dict['site_title'] = wspex_space(aref.text)
                price_dict['site_link'] = aref.get('href')

        product_price_div = price_elem.find(
            'div', {'class': 'goods_price-item current'})
        if product_price_div is not None:
            price_dict['site_cost'] = find_float_number(product_price_div.text)
            raw_weight = product_price_div.get('data-weight')
            # The first character of data-weight is dropped. A missing
            # attribute must not go through str(None)[1:], which would
            # produce the bogus unit 'one'.
            if raw_weight is None:
                price_dict['site_unit'] = ''
            else:
                price_dict['site_unit'] = str(raw_weight)[1:]
        res.append(price_dict)
    return flag_nextpage, res