Beispiel #1
0
    def __call__(self, match):
        """Rewrite a matched relative markdown hyperlink into its HTML URL.

        ``match`` is a regex match whose first group is the raw URL.
        Returns the replacement 'a href="..."' text.
        """
        url = match.groups()[0]
        # BUG FIX: the tuple was previously unpacked as
        # (scheme, netloc, path, query, query, fragment), which overwrote
        # `query` with the `params` component and dropped it on reassembly.
        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)

        if (scheme or netloc or not utils.is_markdown_file(path)):
            # Ignore URLs unless they are a relative link to a markdown file.
            return 'a href="%s"' % url

        if self.nav:
            # If the site navigation has been provided, then validate
            # the internal hyperlink, making sure the target actually exists.
            target_file = self.nav.file_context.make_absolute(path)
            if target_file not in self.nav.source_files:
                source_file = self.nav.file_context.current_file
                msg = ('The page "%s" contained a hyperlink to "%s" which '
                       'is not listed in the "pages" configuration.')
                assert False, msg % (source_file, target_file)
            path = utils.get_url_path(target_file)
            path = self.nav.url_context.make_relative(path)
        else:
            path = utils.get_url_path(path).lstrip('/')

        # Convert the .md hyperlink to a relative hyperlink to the HTML page.
        url = urlparse.urlunparse(
            (scheme, netloc, path, params, query, fragment))
        return 'a href="%s"' % url
Beispiel #2
0
    def __call__(self, match):
        """Rewrite a matched relative markdown hyperlink into its HTML URL.

        ``match`` is a regex match whose first group is the raw URL.
        Returns the replacement 'a href="..."' text.
        """
        url = match.groups()[0]
        # BUG FIX: the tuple was previously unpacked as
        # (scheme, netloc, path, query, query, fragment), which overwrote
        # `query` with the `params` component and dropped it on reassembly.
        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)

        if (scheme or netloc or not utils.is_markdown_file(path)):
            # Ignore URLs unless they are a relative link to a markdown file.
            return 'a href="%s"' % url

        if self.nav:
            # If the site navigation has been provided, then validate
            # the internal hyperlink, making sure the target actually exists.
            target_file = self.nav.file_context.make_absolute(path)
            if target_file not in self.nav.source_files:
                source_file = self.nav.file_context.current_file
                msg = (
                    'The page "%s" contained a hyperlink to "%s" which '
                    'is not listed in the "pages" configuration.'
                )
                assert False, msg % (source_file, target_file)
            path = utils.get_url_path(target_file)
            path = self.nav.url_context.make_relative(path)
        else:
            path = utils.get_url_path(path).lstrip('/')

        # Convert the .md hyperlink to a relative hyperlink to the HTML page.
        url = urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
        return 'a href="%s"' % url
Beispiel #3
0
 def test_url_path(self):
     """utils.get_url_path should map markdown source paths to URL paths."""
     cases = [
         ('index.md', '/'),
         ('api-guide.md', '/api-guide/'),
         ('api-guide/index.md', '/api-guide/'),
         ('api-guide/testing.md', '/api-guide/testing/'),
     ]
     for source_path, expected_url in cases:
         self.assertEqual(utils.get_url_path(source_path), expected_url)
Beispiel #4
0
 def test_url_path(self):
     """Check the markdown-path to URL-path mapping of utils.get_url_path."""
     mapping = {
         'index.md': '/',
         'api-guide.md': '/api-guide/',
         'api-guide/index.md': '/api-guide/',
         'api-guide/testing.md': '/api-guide/testing/',
     }
     for md_path in mapping:
         self.assertEqual(utils.get_url_path(md_path), mapping[md_path])
Beispiel #5
0
 def parse_list_page(self, list_url, html):
     soup = BeautifulSoup(html)
     if soup is None:
         print "soup is None"
         return None
     book_list = soup.find('div', {'class': 'booklist'})
     if book_list is None:
         return None
     chapters = []
     base_url = utils.get_url_path(list_url)
     for li in book_list.findAll('li'):
         try:
             url = li.span.a.get('href').encode("utf-8")
             title = li.text.encode('utf-8')
             chapter = {'url': base_url + url, "title": title,
                        'cid': utils.make_chapter_id(base_url + url)}
             chapters.append(chapter)
         except:
             pass
     return  chapters
Beispiel #6
0
def _generate_site_navigation(pages_config, url_context, use_directory_urls=True):
    """
    Returns a list of Page and Header instances that represent the
    top level site navigation.
    """
    # nav_items: top-level entries (Page or Header with child Pages).
    # pages: flat list of every Page in config order, used below to wire
    # up the 'previous'/'next' links.
    nav_items = []
    pages = []
    previous = None

    for config_line in pages_config:
        if isinstance(config_line, str):
            # Bare-string form: just the source path; titles derived below.
            path = config_line
            title, child_title = None, None
        elif len(config_line) in (1, 2, 3):
            # Pad any items that don't exist with 'None'
            padded_config = (list(config_line) + [None, None])[:3]
            path, title, child_title = padded_config
        else:
            msg = (
                "Line in 'page' config contained %d items.  "
                "Expected 1, 2 or 3 strings." % len(config_line)
            )
            assert False, msg

        # Derive a title from the first path component when none was given
        # (the index page deliberately stays untitled).
        if title is None and os.path.splitext(path)[0] != 'index':
            title = path.split('/')[0]
            title = os.path.splitext(title)[0]
            title = title.replace('-', ' ').replace('_', ' ')
            title = title.capitalize()
        # Derive a child title from the second path component for nested pages.
        if child_title is None and '/' in path:
            child_title = path.split('/')[1]
            child_title = os.path.splitext(child_title)[0]
            child_title = child_title.replace('-', ' ').replace('_', ' ')
            child_title = child_title.capitalize()

        url = utils.get_url_path(path, use_directory_urls)

        if not child_title:
            # New top level page.
            page = Page(title=title, url=url, path=path, url_context=url_context)
            if page.title is not None:
                # Page config lines that do not include a title, such as:
                #    - ['index.md']
                # Will not be added to the nav items hierarchy, although they
                # are included in the full list of pages, and have the
                # appropriate 'next'/'prev' links generated.
                nav_items.append(page)
        elif not nav_items or (nav_items[-1].title != title):
            # New second level page.
            page = Page(title=child_title, url=url, path=path, url_context=url_context)
            header = Header(title=title, children=[page])
            nav_items.append(header)
            page.ancestors = [header]
        else:
            # Additional second level page: attach to the last header, which
            # carries the same parent title as this entry.
            page = Page(title=child_title, url=url, path=path, url_context=url_context)
            header = nav_items[-1]
            header.children.append(page)
            page.ancestors = [header]

        # Add in previous and next information.
        if previous:
            page.previous_page = previous
            previous.next_page = page
        previous = page

        pages.append(page)

    return (nav_items, pages)
Beispiel #7
0
def process_entry(entry, website_pk):
    """Process entry data.

    params:
        - entry: parsed dataset line
        - website_pk: website Mongo <ObjectId> reference

    The entry is handled according to its "page_type": either
    "product_detail" or "product_listing".

    A boolean is returned to mark whether the process ended successfully.
    Unrecoverable failures are raised as exceptions.
    """
    if not entry['extract_ok']:
        return False

    extracted_data = entry['extracted_data']

    if entry['page_type'] == 'product_detail':
        item = extracted_data['item']
        brand = item['brand_name']
        if brand:
            # Resolve (or create) the Brand document; failures here are
            # unrecoverable and propagate to the caller.  (The previous
            # `try/except: raise` wrapper was a no-op.)
            brand = models.Brand(brand=brand).ensure()

        props = {
            "brand": brand,
            "crawled_at": parse_datetime(entry['crawled_at']),
            "discount_percentage": item['discount_percentage'],
            "name": item['article_name'],
            "on_sale": item['on_sale'],
            "price": item['sale_price'],
            "product_type": item['article_type'],
            "properties": item['extra_props'],
            "sku": item['sku'],
            "url": entry['page_url'],
            "website": website_pk,
        }
        # Clean None values so Mongo field defaults apply.
        props = utils.removeNoneValuesFromDict(props)
        p = models.Product(**props)
        try:
            p.ensure()
        except models.DuplicateKeyError:
            logger.debug("Item already exists: %s - %s - %s [%s]" % (
                props.get("sku"),
                props.get("name"),
                props.get("url"),
                props.get("crawled_at"),
            ))
            return False
        except Exception as e:
            # Keep the raw page body around for post-mortem debugging.
            writeErrorFile('detail-%s' % (website_pk), entry['body'])
            raise e

    elif entry['page_type'] == 'product_listing':
        number_of_items = extracted_data['number_of_items']

        props = {
            "page_number": entry['page_number'],
            "page_listing_size": number_of_items,
            "category": entry['product_category'],
            "sorted_by": entry['ordering'],
            "url": entry['page_url'],
            "crawled_at": parse_datetime(entry['crawled_at']),
            "website": website_pk,
        }

        props = utils.removeNoneValuesFromDict(props)

        pl = models.ProductListingPage(**props)
        try:
            pl.ensure()
            pl_pk = pl.pk
        except models.DuplicateKeyError:
            # The listing page already exists: fetch it by its natural key.
            pl = models.ProductListingPage.objects.get(
                {k: v for k, v in props.items()
                 if k in ('url', 'crawled_at')})
            pl_pk = pl.pk

        # -------------------------------------------------------------------------
        # Assign Items
        # -------------------------------------------------------------------------
        total_items = 0
        not_found_products = 0
        listing_added_total = 0
        insufficient_data = 0
        for i, item in enumerate(extracted_data['items']):
            # Without a detail page URL there is no way to match the item
            # to a Product document.
            detail_page_url = item.get('detail_page_url')
            if not detail_page_url:
                continue

            total_items += 1
            # -------------------------------------------------------------------------
            # Find matching Product based on detail_page_url
            # -------------------------------------------------------------------------
            try:
                product = models.Product.objects.get({'path': detail_page_url})
            except models.Product.DoesNotExist:
                logger.debug("No Product match found for %s" %
                             (detail_page_url))
                not_found_products += 1
                continue

            try:
                li_props = {
                    "position": i + 1,
                    "price": item['sale_price'],
                    "on_sale": item['on_sale'],
                    "discount_percentage": item['discount_percentage'],
                    "listing_props": item['listing_props'],
                    "listing": pl_pk,
                }
                # Create the Listing Item.
                li = models.ProductListingItem(**li_props)
            except Exception as e:
                writeErrorFile('listing-%s' % (pl_pk), entry['body'])
                logger.error(e)
                insufficient_data += 1
                continue

            # Skip if this listing is already attached to the product.
            if any(l.listing._id == pl_pk for l in product.listings):
                listing_added_total += 1
                continue

            # -------------------------------------------------------------------------
            # Add New Listing to Product listings
            # -------------------------------------------------------------------------
            product.listings.append(li)

            try:
                product.save()
                listing_added_total += 1
            except Exception as e:
                logger.error(e)
                writeErrorFile('listing-%s-%s' % (pl_pk, i), entry['body'])

        # -------------------------------------------------------------------------
        # Debug stats
        # -------------------------------------------------------------------------
        logger.debug("""%s: stats (ok:%s/missing:%s/nodata:%s/total:%s)""" % (
            utils.get_url_path(entry['page_url']),
            listing_added_total,
            not_found_products,
            insufficient_data,
            total_items,
        ))

        return True
    else:
        logger.error("Unknown page_type")
        return False

    # Reached only after a successful 'product_detail' entry.
    return True
Beispiel #8
0
    def parse_product_listing_item(self, xitem):
        """Parse HTML for Listed Product Data.

        Returns a dict with sku, names, pricing and listing props for a
        single product tile.
        """
        item_info = {}

        # -------------------------------------------------------------------------
        # Creating shorthands
        # -------------------------------------------------------------------------
        xitem_attrs = getattr(xitem, 'attrs', {})
        xpaths = self.item_listing_select_xpaths.get
        pricing = utils.convert_html_price_to_float
        get_xpath_text = partial(self.get_select_path_text,
                                 xitem=xitem,
                                 default=None)
        get_xpath_attr = partial(self.get_select_path_attr,
                                 xitem=xitem,
                                 default=None)

        item_info['detail_page_url'] = utils.get_url_path(
            get_xpath_attr(xpath=xpaths('detail_page_url'), attr='href'))

        try:
            # Prefer the structured "data-google" JSON payload when present.
            google_data = json.loads(
                get_xpath_attr(xpath=xpaths('detail_page_url'),
                               attr="data-google"))
            item_info['sku'] = google_data.get('id')
            item_info['article_name'] = google_data.get('name')
            item_info['brand_name'] = google_data.get('brand')
            item_info['sale_price'] = pricing(google_data.get('price'))
        except Exception:
            # Attribute missing or malformed JSON: fall back to scraping
            # the individual fields.  (Was a bare `except:`.)
            item_info['brand_name'] = get_xpath_text(
                xpath=xpaths('brand_name'))
            item_info['sku'] = xitem_attrs.get('data-artikel')

        item_info['article_type'] = get_xpath_text(
            xpath=xpaths('article_type'))

        # -------------------------------------------------------------------------
        # Extract Pricing Info
        # -------------------------------------------------------------------------
        price_info = {}
        price_info['price_special'] = pricing(
            get_xpath_text(xpath=xpaths('price_special')))
        price_info['price_normal'] = pricing(
            get_xpath_text(xpath=xpaths('price_normal')))
        price_info['price_listing'] = pricing(
            get_xpath_text(xpath=xpaths('price_listing')))

        # A special price lower than the listing price wins.  Guard against
        # a missing listing price before comparing (None < float raises on
        # Python 3; on Python 2 the comparison was False, so behavior for
        # existing inputs is unchanged).
        if (price_info['price_special'] and
                price_info['price_listing'] is not None and
                price_info['price_special'] < price_info['price_listing']):
            price_info['price_listing'] = price_info['price_special']

            if price_info['price_normal'] and price_info['price_normal'] > 0.0:
                price_info['price_discount'] = (
                    1.0 - (price_info['price_special'] /
                           price_info['price_normal'])) * 100.0

        if item_info.get('sale_price') is None:
            item_info['sale_price'] = price_info['price_listing']

        item_info['discount_percentage'] = utils.calcDiscountPercentage(
            new_price=item_info['sale_price'],
            old_price=price_info['price_normal'])
        item_info['on_sale'] = item_info['discount_percentage'] > 0.0

        # -------------------------------------------------------------------------
        # Extra Props
        # -------------------------------------------------------------------------
        extra_props = {}
        extra_props['overview_position'] = xitem_attrs.get('data-position')
        extra_props['badge'] = get_xpath_text(
            xpath="span.badge > span.badge-label")
        extra_props['price_info'] = price_info

        item_info['listing_props'] = extra_props

        return item_info
Beispiel #9
0
    def parse_product_listing_item(self, xitem):
        """Parse HTML for Listed Product Data.

        A product tile may contain several colour variants; one item dict
        is returned per variant.
        """
        items = []

        # -------------------------------------------------------------------------
        # Creating shorthands
        # -------------------------------------------------------------------------
        xitem_attrs = getattr(xitem, 'attrs', {})
        pricing = utils.convert_html_price_to_float
        get_xpath_text = partial(self.get_select_path_text,
                                 xitem=xitem,
                                 default=None)

        # -------------------------------------------------------------------------
        # Get variants
        # -------------------------------------------------------------------------
        xvariants = [
            x.attrs.get('data-colorid')
            for x in xitem.select('div.colorDivItem > ul')
            if 'data-colorid' in getattr(x, 'attrs', {})
        ]

        normal_price = pricing(
            get_xpath_text(xpath='div.content > span.offerText'))
        on_sale = 'vanvoor' in xitem_attrs.get('class', [])

        # -------------------------------------------------------------------------
        # Loop Variants and extract data per variant
        # -------------------------------------------------------------------------
        for xvar in xvariants:
            item_info = {
                "on_sale": on_sale,
            }
            extra_props = {
                "normal_price": normal_price,
            }

            xlink = getattr(
                xitem.select_one('div.colorDivItem > ul[data-colorid="%s"] a' %
                                 (xvar)), 'attrs', {})
            # Collapse leading '../' segments into a single '/'.
            # BUG FIX: the pattern was r'(../)+', where the unescaped dots
            # match ANY two characters followed by '/'.
            item_info['detail_page_url'] = re.sub(
                r'(\.\./)+', '/', utils.get_url_path(xlink.get('href')))
            item_info['article_name'] = get_xpath_text(
                xpath='div.content > div[data-colorid="%s"] > a.title' %
                (xvar))
            item_info['sale_price'] = pricing(
                get_xpath_text(
                    xpath='div.content > div[data-colorid="%s"] > span.price' %
                    (xvar)))

            item_info['discount_percentage'] = utils.calcDiscountPercentage(
                new_price=item_info['sale_price'],
                old_price=normal_price,
            )

            # -------------------------------------------------------------------------
            # Extra Properties
            # -------------------------------------------------------------------------
            extra_props['color'] = xlink.get('title')
            item_info['listing_props'] = extra_props

            items.append(dict(item_info))

        return items