def __call__(self, match):
    url = match.groups()[0]
    # urlparse returns a 6-tuple: (scheme, netloc, path, params, query, fragment).
    # The original unpacked 'query' twice, silently discarding 'params'.
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)

    if scheme or netloc or not utils.is_markdown_file(path):
        # Ignore URLs unless they are a relative link to a markdown file.
        return 'a href="%s"' % url

    if self.nav:
        # If the site navigation has been provided, then validate
        # the internal hyperlink, making sure the target actually exists.
        target_file = self.nav.file_context.make_absolute(path)
        if target_file not in self.nav.source_files:
            source_file = self.nav.file_context.current_file
            msg = ('The page "%s" contained a hyperlink to "%s" which '
                   'is not listed in the "pages" configuration.')
            assert False, msg % (source_file, target_file)
        path = utils.get_url_path(target_file)
        path = self.nav.url_context.make_relative(path)
    else:
        path = utils.get_url_path(path).lstrip('/')

    # Convert the .md hyperlink to a relative hyperlink to the HTML page.
    url = urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
    return 'a href="%s"' % url
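# How a callable like the one above is typically wired up: it is passed as
# the replacement argument to re.sub, which invokes it once per match. The
# pattern and the '.md' -> '/' rewrite below are illustrative assumptions,
# not the exact regex used by the surrounding module.
import re

def _rewrite_href(match):
    url = match.groups()[0]
    return 'a href="%s"' % url.replace('.md', '/')

html = '<a href="about.md">About</a> <a href="http://example.com">Ext</a>'
print(re.sub(r'a href="([^"]*)"', _rewrite_href, html))
# -> <a href="about/">About</a> <a href="http://example.com">Ext</a>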
def test_url_path(self):
    expected_results = {
        'index.md': '/',
        'api-guide.md': '/api-guide/',
        'api-guide/index.md': '/api-guide/',
        'api-guide/testing.md': '/api-guide/testing/',
    }
    for file_path, expected_html_path in expected_results.items():
        html_path = utils.get_url_path(file_path)
        self.assertEqual(html_path, expected_html_path)
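# utils.get_url_path itself is not shown here; a minimal sketch that
# satisfies the expectations encoded in the test above (assuming the
# use_directory_urls=True behaviour seen elsewhere in these snippets).
import os

def get_url_path_sketch(path, use_directory_urls=True):
    path = os.path.splitext(path)[0]        # strip the .md extension
    if os.path.basename(path) == 'index':   # index files map to their directory
        path = os.path.dirname(path)
    if use_directory_urls:
        return '/' + (path + '/' if path else '')
    return '/%s.html' % (path or 'index')

assert get_url_path_sketch('index.md') == '/'
assert get_url_path_sketch('api-guide/index.md') == '/api-guide/'
assert get_url_path_sketch('api-guide/testing.md') == '/api-guide/testing/'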
def parse_list_page(self, list_url, html):
    soup = BeautifulSoup(html)
    if soup is None:
        print("soup is None")
        return None
    book_list = soup.find('div', {'class': 'booklist'})
    if book_list is None:
        return None
    chapters = []
    base_url = utils.get_url_path(list_url)
    for li in book_list.findAll('li'):
        try:
            url = li.span.a.get('href').encode('utf-8')
            title = li.text.encode('utf-8')
            chapter = {
                'url': base_url + url,
                'title': title,
                'cid': utils.make_chapter_id(base_url + url),
            }
            chapters.append(chapter)
        except AttributeError:
            # Skip list items that do not contain a chapter link.
            pass
    return chapters
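# utils.make_chapter_id is referenced above but not shown; a plausible
# stand-in that derives a stable id from the chapter URL. The md5 scheme
# here is an assumption, not the project's actual implementation.
import hashlib

def make_chapter_id_sketch(url):
    return hashlib.md5(url.encode('utf-8')).hexdigest()

print(make_chapter_id_sketch('http://example.com/book/ch1.html'))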
def _generate_site_navigation(pages_config, url_context, use_directory_urls=True):
    """
    Returns a list of Page and Header instances that represent the
    top level site navigation.
    """
    nav_items = []
    pages = []
    previous = None

    for config_line in pages_config:
        if isinstance(config_line, str):
            path = config_line
            title, child_title = None, None
        elif len(config_line) in (1, 2, 3):
            # Pad any items that don't exist with 'None'
            padded_config = (list(config_line) + [None, None])[:3]
            path, title, child_title = padded_config
        else:
            msg = (
                "Line in 'pages' config contained %d items. "
                "Expected 1, 2 or 3 strings." % len(config_line)
            )
            assert False, msg

        if title is None and os.path.splitext(path)[0] != 'index':
            title = path.split('/')[0]
            title = os.path.splitext(title)[0]
            title = title.replace('-', ' ').replace('_', ' ')
            title = title.capitalize()
        if child_title is None and '/' in path:
            child_title = path.split('/')[1]
            child_title = os.path.splitext(child_title)[0]
            child_title = child_title.replace('-', ' ').replace('_', ' ')
            child_title = child_title.capitalize()

        url = utils.get_url_path(path, use_directory_urls)

        if not child_title:
            # New top level page.
            page = Page(title=title, url=url, path=path, url_context=url_context)
            if page.title is not None:
                # Page config lines that do not include a title, such as:
                #   - ['index.md']
                # will not be added to the nav items hierarchy, although they
                # are included in the full list of pages, and have the
                # appropriate 'next'/'prev' links generated.
                nav_items.append(page)
        elif not nav_items or (nav_items[-1].title != title):
            # New second level page.
            page = Page(title=child_title, url=url, path=path, url_context=url_context)
            header = Header(title=title, children=[page])
            nav_items.append(header)
            page.ancestors = [header]
        else:
            # Additional second level page.
            page = Page(title=child_title, url=url, path=path, url_context=url_context)
            header = nav_items[-1]
            header.children.append(page)
            page.ancestors = [header]

        # Add in previous and next information.
        if previous:
            page.previous_page = previous
            previous.next_page = page
        previous = page

        pages.append(page)

    return (nav_items, pages)
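# A standalone illustration of the title-derivation rules used above: with
# no explicit title, 'api-guide/running_tests.md' becomes the header
# 'Api guide' with child page 'Running tests'. Names here are examples only.
import os

def derive_titles(path):
    def prettify(part):
        part = os.path.splitext(part)[0]
        return part.replace('-', ' ').replace('_', ' ').capitalize()
    parts = path.split('/')
    title = prettify(parts[0])
    child_title = prettify(parts[1]) if len(parts) > 1 else None
    return title, child_title

assert derive_titles('api-guide/running_tests.md') == ('Api guide', 'Running tests')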
def process_entry(entry, website_pk):
    """Process entry data.

    params:
    - entry: parsed dataset line
    - website_pk: website Mongo <ObjectId> reference

    The function processes the entry data based on the page type
    ('product_detail' or 'product_listing'). A boolean value is returned
    to mark whether the process ended successfully or failed. The process
    can also raise an exception for unrecoverable failures.
    """
    if not entry['extract_ok']:
        return False

    extracted_data = entry['extracted_data']

    if entry['page_type'] == 'product_detail':
        item = extracted_data['item']
        brand = item['brand_name']
        if brand:
            brand = models.Brand(brand=brand).ensure()
        props = {
            "brand": brand,
            "crawled_at": parse_datetime(entry['crawled_at']),
            "discount_percentage": item['discount_percentage'],
            "name": item['article_name'],
            "on_sale": item['on_sale'],
            "price": item['sale_price'],
            "product_type": item['article_type'],
            "properties": item['extra_props'],
            "sku": item['sku'],
            "url": entry['page_url'],
            "website": website_pk,
            # path=None,
            # listings=[],
        }
        # Clean None values.
        props = utils.removeNoneValuesFromDict(props)
        p = models.Product(**props)
        try:
            p.ensure()
        except models.DuplicateKeyError:
            logger.debug("Item already exists: %s - %s - %s [%s]" % (
                props.get("sku"),
                props.get("name"),
                props.get("url"),
                props.get("crawled_at"),
            ))
            return False
        except Exception as e:
            writeErrorFile('detail-%s' % (website_pk), entry['body'])
            raise e

    elif entry['page_type'] == 'product_listing':
        number_of_items = extracted_data['number_of_items']
        # number_of_items = len(extracted_data['items'])
        props = {
            "page_number": entry['page_number'],
            "page_listing_size": number_of_items,
            "category": entry['product_category'],
            "sorted_by": entry['ordering'],
            "url": entry['page_url'],
            "crawled_at": parse_datetime(entry['crawled_at']),
            "website": website_pk,
        }
        props = utils.removeNoneValuesFromDict(props)
        pl = models.ProductListingPage(**props)
        try:
            pl.ensure()
            pl_pk = pl.pk
        except models.DuplicateKeyError:
            pl = models.ProductListingPage.objects.get(
                {k: v for k, v in props.items() if k in ('url', 'crawled_at')})
            pl_pk = pl.pk

        # ---------------------------------------------------------------------
        # Assign Items
        # ---------------------------------------------------------------------
        total_items = 0
        not_found_products = 0
        listing_added_total = 0
        insufficient_data = 0
        for i, item in enumerate(extracted_data['items']):
            # -----------------------------------------------------------------
            # Find Item first
            # -----------------------------------------------------------------
            detail_page_url = item.get('detail_page_url')
            if not detail_page_url:
                continue
            total_items += 1

            # -----------------------------------------------------------------
            # Find matching Product based on detail_page_url
            # -----------------------------------------------------------------
            try:
                product = models.Product.objects.get({'path': detail_page_url})
            except models.Product.DoesNotExist:
                logger.debug("No Product match found for %s" % (detail_page_url))
                not_found_products += 1
                continue

            try:
                li_props = {
                    "position": i + 1,
                    "price": item['sale_price'],
                    "on_sale": item['on_sale'],
                    "discount_percentage": item['discount_percentage'],
                    "listing_props": item['listing_props'],
                    "listing": pl_pk,
                }
                # -------------------------------------------------------------
                # Create Listing Item
                # -------------------------------------------------------------
                li = models.ProductListingItem(**li_props)
            except Exception as e:
                writeErrorFile('listing-%s' % (pl_pk), entry['body'])
                logger.error(e)
                insufficient_data += 1
                continue

            if any(l.listing._id == pl_pk for l in product.listings):
                # Listing already added to product.
                listing_added_total += 1
                continue

            # -----------------------------------------------------------------
            # Add new Listing to Product listings
            # -----------------------------------------------------------------
            product.listings.append(li)
            try:
                product.save()
                listing_added_total += 1
            except Exception as e:
                logger.error(e)
                writeErrorFile('listing-%s-%s' % (pl_pk, i), entry['body'])

        # ---------------------------------------------------------------------
        # Debug stats
        # ---------------------------------------------------------------------
        logger.debug("%s: stats (ok:%s/missing:%s/nodata:%s/total:%s)" % (
            utils.get_url_path(entry['page_url']),
            listing_added_total,
            not_found_products,
            insufficient_data,
            total_items,
        ))
        return True

    else:
        logger.error("Unknown page_type")
        return False

    return True
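# utils.removeNoneValuesFromDict is relied on above but not shown; a
# minimal sketch of the behaviour the calls appear to expect (drop every
# key whose value is None) -- an assumption, not the actual helper.
def remove_none_values_from_dict(d):
    return {k: v for k, v in d.items() if v is not None}

assert remove_none_values_from_dict({'sku': '123', 'brand': None}) == {'sku': '123'}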
def parse_product_listing_item(self, xitem):
    """Parse HTML for Listed Product Data"""
    item_info = {}

    # -------------------------------------------------------------------------
    # Creating shorthands
    # -------------------------------------------------------------------------
    xitem_attrs = getattr(xitem, 'attrs', {})
    xpaths = self.item_listing_select_xpaths.get
    pricing = utils.convert_html_price_to_float
    get_xpath_text = partial(self.get_select_path_text, xitem=xitem, default=None)
    get_xpath_attr = partial(self.get_select_path_attr, xitem=xitem, default=None)

    item_info['detail_page_url'] = utils.get_url_path(
        get_xpath_attr(xpath=xpaths('detail_page_url'), attr='href'))

    try:
        google_data = json.loads(
            get_xpath_attr(xpath=xpaths('detail_page_url'), attr="data-google"))
        item_info['sku'] = google_data.get('id')
        item_info['article_name'] = google_data.get('name')
        item_info['brand_name'] = google_data.get('brand')
        item_info['sale_price'] = pricing(google_data.get('price'))
    except (TypeError, ValueError):
        # No (valid) 'data-google' JSON payload; fall back to scraping the
        # individual fields from the markup.
        item_info['brand_name'] = get_xpath_text(xpath=xpaths('brand_name'))
        item_info['sku'] = xitem_attrs.get('data-artikel')

    item_info['article_type'] = get_xpath_text(xpath=xpaths('article_type'))

    # -------------------------------------------------------------------------
    # Extract Pricing Info
    # -------------------------------------------------------------------------
    price_info = {}
    price_info['price_special'] = pricing(get_xpath_text(xpath=xpaths('price_special')))
    price_info['price_normal'] = pricing(get_xpath_text(xpath=xpaths('price_normal')))
    price_info['price_listing'] = pricing(get_xpath_text(xpath=xpaths('price_listing')))

    # Guard against missing prices before comparing/dividing.
    if (price_info['price_special'] and price_info['price_listing'] and
            price_info['price_special'] < price_info['price_listing']):
        price_info['price_listing'] = price_info['price_special']
    if (price_info['price_special'] and price_info['price_normal'] and
            price_info['price_normal'] > 0.0):
        price_info['price_discount'] = (
            1.0 - (price_info['price_special'] / price_info['price_normal'])) * 100.0

    if item_info.get('sale_price') is None:
        item_info['sale_price'] = price_info['price_listing']

    item_info['discount_percentage'] = utils.calcDiscountPercentage(
        new_price=item_info['sale_price'],
        old_price=price_info['price_normal'])
    item_info['on_sale'] = item_info['discount_percentage'] > 0.0

    # -------------------------------------------------------------------------
    # Extra Props
    # -------------------------------------------------------------------------
    extra_props = {}
    extra_props['overview_position'] = xitem_attrs.get('data-position')
    extra_props['badge'] = get_xpath_text(xpath="span.badge > span.badge-label")
    extra_props['price_info'] = price_info

    item_info['listing_props'] = extra_props
    return item_info
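# utils.calcDiscountPercentage is not shown; a sketch consistent with how
# it is called above and with the 'on_sale = discount > 0.0' check
# (returning 0.0 whenever a discount cannot be computed is an assumption).
def calc_discount_percentage_sketch(new_price, old_price):
    if not new_price or not old_price:
        return 0.0
    return max(0.0, (1.0 - (new_price / old_price)) * 100.0)

assert calc_discount_percentage_sketch(75.0, 100.0) == 25.0
assert calc_discount_percentage_sketch(None, 100.0) == 0.0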
def parse_product_listing_item(self, xitem):
    """Parse HTML for Listed Product Data (one item per colour variant)."""
    items = []

    # -------------------------------------------------------------------------
    # Creating shorthands
    # -------------------------------------------------------------------------
    xitem_attrs = getattr(xitem, 'attrs', {})
    pricing = utils.convert_html_price_to_float
    get_xpath_text = partial(self.get_select_path_text, xitem=xitem, default=None)
    get_xpath_attr = partial(self.get_select_path_attr, xitem=xitem, default=None)

    # -------------------------------------------------------------------------
    # Get variants
    # -------------------------------------------------------------------------
    xvariants = [
        x.attrs.get('data-colorid')
        for x in xitem.select('div.colorDivItem > ul')
        if 'data-colorid' in getattr(x, 'attrs', {})
    ]

    normal_price = pricing(get_xpath_text(xpath='div.content > span.offerText'))
    on_sale = 'vanvoor' in xitem_attrs.get('class', [])

    # -------------------------------------------------------------------------
    # Loop Variants and extract data per variant
    # -------------------------------------------------------------------------
    for xvar in xvariants:
        item_info = {
            "on_sale": on_sale,
        }
        extra_props = {
            "normal_price": normal_price,
        }
        xlink = getattr(
            xitem.select_one('div.colorDivItem > ul[data-colorid="%s"] a' % (xvar)),
            'attrs', {})
        # Collapse leading '../' segments to a root-relative path. The dots
        # must be escaped; the original pattern r'(../)+' matched any two
        # characters followed by a slash.
        item_info['detail_page_url'] = re.sub(
            r'(\.\./)+', '/', utils.get_url_path(xlink.get('href')))
        item_info['article_name'] = get_xpath_text(
            xpath='div.content > div[data-colorid="%s"] > a.title' % (xvar))
        item_info['sale_price'] = pricing(
            get_xpath_text(
                xpath='div.content > div[data-colorid="%s"] > span.price' % (xvar)))
        item_info['discount_percentage'] = utils.calcDiscountPercentage(
            new_price=item_info['sale_price'],
            old_price=normal_price,
        )

        # ---------------------------------------------------------------------
        # Extra Properties
        # ---------------------------------------------------------------------
        extra_props['color'] = xlink.get('title')
        item_info['listing_props'] = extra_props
        items.append(dict(item_info))

    return items
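# utils.convert_html_price_to_float is not shown; a sketch of a parser for
# European-style price strings such as '1.299,95' (the format is an
# assumption based on the Dutch-looking markup above).
import re

def convert_html_price_to_float_sketch(text):
    if not text:
        return None
    digits = re.sub(r'[^\d,.]', '', text)           # keep digits and separators
    digits = digits.replace('.', '').replace(',', '.')  # 1.299,95 -> 1299.95
    try:
        return float(digits)
    except ValueError:
        return None

assert convert_html_price_to_float_sketch('1.299,95') == 1299.95
assert convert_html_price_to_float_sketch(None) is None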