def parse_product(self, response):
    """Scrape a product from ``response`` or, failing that, follow its links.

    Every selector in ``self.selectors`` is applied to the parsed page and
    the first matching node fills the ``Product`` field of the same name.
    The page is treated as a product page only when both the ``name`` and
    ``number`` fields were found.

    Args:
        response: The downloaded page; ``response.body`` is parsed with
            lxml and ``response.url`` is used to resolve relative image
            URLs.

    Yields:
        For a product page, a single ``Product`` with ``url``, ``pricing``
        and ``discountcode`` added. Otherwise one ``Request`` per link
        returned by ``self.link_extractor.extract_links(response)``.
    """
    soup = BeautifulSoup(response.body, 'lxml')
    p = Product()
    for element, path in self.selectors.viewitems():
        node = soup.select_one(path)
        if not node:
            continue
        if element == 'image':
            # A matched node without a usable ``src`` used to raise KeyError
            # and abort the whole page; leave the image field unset instead.
            src = node.get('src')
            if not src:
                continue
            p[element] = url_fix(urljoin(response.url, src))
        else:
            p[element] = text(node)

    if 'name' in p and 'number' in p:
        p['url'] = response.url
        p['pricing'], p['discountcode'] = get_prices(soup)
        # Everything needed has been copied out of the tree by now.
        soup.decompose()
        yield p
    else:
        # Only follow links on non-product pages
        soup.decompose()
        for link in self.link_extractor.extract_links(response):
            yield Request(url=link.url)
def get_discount_code(tags, depth=3):
    """Find a discount code by expanding the search outwards.

    The text of each tag is searched with ``discount_re``. When a tag does
    not match, its parent is searched instead, recursively, for at most
    ``depth`` further levels before moving on to the next tag.

    Args:
        tags: Iterable of tags to start searching from.
        depth: How many parent levels may still be climbed from each tag.

    Returns:
        The first capture group of the first match, ASCII-encoded with
        non-ASCII characters dropped, or ``None`` when nothing matched.
    """
    for tag in tags:
        matches = discount_re.search(text(tag))
        if matches:
            return matches.group(1).encode('ascii', errors='ignore')
        if tag.parent and depth:
            code = get_discount_code([tag.parent], depth - 1)
            if code:
                # The recursive call has already encoded the value;
                # encoding it a second time is redundant (and fails on
                # byte strings that have no ``encode`` method).
                return code
    return None
def extract_price_info(soup):
    """Extract price tables and a discount code from a parsed page.

    Price cells are located with the ``row_price`` matcher first; if that
    finds nothing, ``column_price`` is used and the resulting matrices are
    transposed. The enclosing ``<table>`` of every price cell is collected,
    converted to a matrix of cell texts, searched for a discount code and
    finally removed from ``soup`` via ``decompose_all``.

    Args:
        soup: The parsed document to search. The price tables found are
            passed to ``decompose_all`` before returning.

    Returns:
        A ``(matrices, discount_code)`` tuple. ``matrices`` holds one list
        of rows per price table; ``discount_code`` is whatever
        ``get_discount_code`` returns for those tables.
    """
    # Find price tables by searching upwards from price cells
    price_cells = soup.find_all(row_price)
    transpose = False
    if not price_cells:
        price_cells = soup.find_all(column_price)
        transpose = True

    # Remove duplicates while preserving order. ``groupby`` only collapsed
    # *consecutive* repeats, so a table whose cells are interleaved with a
    # nested table's cells was processed twice. Cells without an enclosing
    # table yield ``None`` and are skipped rather than crashing below.
    dom_tables = []
    for cell in price_cells:
        table = cell.find_parent('table')
        if table is None or table in dom_tables:
            continue
        dom_tables.append(table)

    # Convert dom tables to matrices and transpose if necessary
    matrices = [[[text(c) for c in r] for r in t('tr')] for t in dom_tables]
    if transpose:
        # Ragged rows are padded with None by izip_longest.
        matrices = [list(izip_longest(*m)) for m in matrices]

    # Look for the code before the tables are destroyed.
    discount_code = get_discount_code(dom_tables)
    decompose_all(dom_tables)
    return (matrices, discount_code)
def quant_str(string):
    """Extract a quantity from ``string`` as a plain ASCII digit string.

    The text of ``string`` is searched with ``quantity_re``; thousands
    separators (``,``) and plus signs are removed from the first capture
    group.

    Args:
        string: Anything accepted by the ``text`` helper.

    Returns:
        The cleaned, ASCII-encoded quantity (non-ASCII characters dropped),
        or ``None`` when ``quantity_re`` does not match.
    """
    matches = quantity_re.search(text(string))
    if matches:
        # Raw pattern: '\+' in a normal string literal is an invalid escape
        # sequence. Substituting before encoding keeps pattern and subject
        # the same string type; the result is unchanged because only ASCII
        # characters are removed.
        cleaned = re.sub(r'[,+]', '', matches.group(1))
        return cleaned.encode('ascii', errors='ignore')
    return None
def price_str(string):
    """Extract a price from ``string`` as an ASCII-encoded string.

    The text of ``string`` is searched with ``price_re`` and the first
    capture group is returned with non-ASCII characters dropped.

    Returns:
        The encoded price, or ``None`` when ``price_re`` does not match.
    """
    found = price_re.search(text(string))
    if not found:
        return None
    price = found.group(1)
    return price.encode('ascii', errors='ignore')