Esempio n. 1
0
    def process(self):
        # Purpose: fetch the category page named by the request's "category_url",
        # then get-or-insert the Department, the Category and one Product for each
        # 'productCont' DIV found inside the 'productsList' DIV, and attach Image
        # entities built from self.product_template["images"] to each Product.
        #
        # NOTE(review): this listing is cut off after the image-creation step. The
        # outer ``try`` below has no visible except/finally clause, and nothing
        # visible reads ``result`` or ``change`` or stores the modified Product.
        result = None
        category_key_name = ""
        try:
            # Request parameters, passed through str()
            department_key = str(self.request.get("department_key"))
            category_url = str(self.request.get("category_url"))

            # Fetch the page using the method, deadline and headers in self.urlfetch
            response = urlfetch.fetch(
                url=category_url,
                method=self.urlfetch["method"],
                deadline=self.urlfetch["deadline"],
                headers=self.urlfetch["headers"],
            )

            # Allowed colour values: the ``choices`` of Product's "colour" property
            colours = models.Product.properties()["colour"].choices

            # Only a 200 response is parsed; no else branch is visible in this listing
            if response.status_code == 200:
                result = response.content

                # Parse the response content into a MiniDom object
                minidom_document = html_parser.parse_html_document(result)

                # Get or create the Department; its name is the title-cased key
                department = models.Department.get_or_insert(key_name=department_key, name=department_key.title())
                department_display_name = department.name

                # Get the Category name from the first child node of the page's first H1.
                # This lookup sits outside the inner try, so a page without an H1 raises here.
                category_name = minidom_document.getElementsByTagName("h1")[0].childNodes[0].nodeValue
                try:
                    # Key name is the lower-cased category name with spaces removed
                    if category_name is not None:
                        category_key_name = category_name.lower().replace(" ", "")
                except Exception, e:
                    # Logged only: category_key_name then keeps its initial "" value
                    logging.error("Get Category")
                    logging.error(e)

                # Get or create the Category, linked to the Department
                category = models.Category.get_or_insert(
                    key_name=category_key_name, name=category_name, department=department
                )

                # Collect all DIVs
                all_div_elements = minidom_document.getElementsByTagName("div")

                # For each DIV in the MiniDom object
                for div in all_div_elements:

                    # If the DIV id is productsList, we've found the product list container
                    if div.getAttribute("id") == "productsList":
                        # Get the DIVs inside it
                        product_list_divs = div.getElementsByTagName("div")
                        # For each of those (note: this rebinds the outer loop's ``div`` name)
                        for div in product_list_divs:
                            # If the DIV includes a class value of 'productCont' then we expect it to be a product
                            if "productCont" in div.getAttribute("class"):
                                # Per-product defaults, used when a field cannot be extracted
                                sku = None
                                name = ""
                                price = None
                                product = div
                                colour = ""
                                # _type is never reassigned in the visible code
                                _type = None
                                collection = None
                                url = None
                                # Get SKU code: the element id with every "id_" removed
                                try:
                                    sku = product.getAttribute("id")
                                    sku = sku.replace("id_", "")
                                except Exception, e:
                                    # A missing SKU is fatal: log, then raise a new Exception
                                    logging.error("Get SKU")
                                    logging.error(e)
                                    raise Exception("No product SKU found for category_url : " + category_url)

                                # Get Collection and Name from the first DIV whose class contains 'product_info'
                                try:
                                    # Note: ``div`` is rebound once more by this loop
                                    for div in product.getElementsByTagName("div"):
                                        if "product_info" in div.getAttribute("class"):
                                            spans = div.getElementsByTagName("span")
                                            for span in spans:
                                                if "product_collection" in span.getAttribute("class"):
                                                    # The collection is the span's first child node value
                                                    collection = span.childNodes[0].nodeValue
                                                else:
                                                    # Every other span contributes its stripped child node
                                                    # values to the name, separated by single spaces
                                                    nodes = span.childNodes
                                                    for node in nodes:
                                                        name = name + node.nodeValue.strip() + " "

                                            name = name.strip()
                                            break
                                except Exception, e:
                                    # A failure here is fatal: log, then raise a new Exception
                                    logging.error("Error getting Collection and Name")
                                    logging.error(e)
                                    raise Exception("No Product Name found for category_url : " + category_url)

                                # Get Product Colour: the first word of the name that is an allowed colour
                                try:
                                    for word in name.split(" "):
                                        if word in colours:
                                            colour = word
                                            break
                                except Exception, e:
                                    # Only a fixed message is logged; ``e`` itself is not
                                    logging.error("Error Getting Colour")

                                # Get Product URL: href of the first anchor whose class contains 'product_image_link'
                                try:
                                    anchors = product.getElementsByTagName("a")
                                    if len(anchors) > 0:
                                        for anchor in anchors:
                                            if "product_image_link" in anchor.getAttribute("class"):
                                                url = anchor.getAttribute("href")
                                                break
                                except Exception, e:
                                    # Only a fixed message is logged; url keeps its current value
                                    logging.error("Error Getting Product URL")

                                # Get Product Price from the paragraphs whose class contains 'product_price'
                                try:
                                    product_paragraphs = product.getElementsByTagName("p")
                                    for paragraph in product_paragraphs:
                                        if "product_price" in paragraph.getAttribute("class"):
                                            spans = paragraph.getElementsByTagName("span")
                                            for span in spans:
                                                span_class = span.getAttribute("class")

                                                if "one_price" in span_class:
                                                    # Single price: value of the span's second child node
                                                    price = span.childNodes[1].nodeValue
                                                    # Leaves the span loop only; the paragraph loop continues
                                                    break
                                                elif "now_price" in span_class:
                                                    # Reduced price: second child node of the nested 'price_value' span
                                                    for price_span in span.getElementsByTagName("span"):
                                                        if "price_value" in price_span.getAttribute("class"):
                                                            price = price_span.childNodes[1].nodeValue
                                                            # Leaves the price_span loop only
                                                            break
                                    if price is not None:
                                        # Drop thousands separators before converting to float
                                        price = price.replace(",", "")
                                        price = float(price)

                                except Exception, e:
                                    # Logged only: processing continues with whatever ``price`` holds
                                    logging.error("Get Price")
                                    logging.error(e)

                                # Create or Get the Product if it already exists.
                                # ``product`` is rebound here from the DOM element to the model entity.
                                product = models.Product.get_or_insert(
                                    key_name=sku,
                                    name=name,
                                    price=price,
                                    colour=colour,
                                    type=_type,
                                    category=category,
                                    department=department_display_name,
                                    collection=collection,
                                    url=url,
                                )

                                # Change flag for the Product; no read of it is visible in this listing
                                change = False

                                # Create Images: one Image per template entry, with "@id@" replaced by the SKU
                                try:
                                    for key, src in self.product_template["images"].items():
                                        path = src.replace("@id@", sku)
                                        image = models.Image(type=key, url=path)
                                        # Append the value returned by image.save() to the Product's image list
                                        image_key = image.save()
                                        product.images.append(image_key)

                                    change = True
                                except Exception, e:
                                    # A failure here is fatal: log, then raise a new Exception
                                    logging.error("Get Medium Image")
                                    logging.error(e)
                                    raise Exception("No Product Medium Image found for category_url : " + category_url)
Esempio n. 2
0
    def process(self):
        # Purpose: fetch the category page named by the request's 'category_url',
        # then get-or-insert the Department, the Category and one Product for each
        # 'productCont' DIV found inside the 'productsList' DIV, and attach Image
        # entities built from self.product_template['images'] to each Product.
        #
        # NOTE(review): this listing is cut off after the image-creation step. The
        # outer ``try`` below has no visible except/finally clause, and nothing
        # visible reads ``result`` or ``change`` or stores the modified Product.
        result = None
        category_key_name = ''
        try:
            # Request parameters, passed through str()
            department_key = str(self.request.get('department_key'))
            category_url = str(self.request.get('category_url'))

            # Fetch the page using the method, deadline and headers in self.urlfetch
            response = urlfetch.fetch(url=category_url,
                                      method=self.urlfetch['method'],
                                      deadline=self.urlfetch['deadline'],
                                      headers=self.urlfetch['headers'])

            # Allowed colour values: the ``choices`` of Product's 'colour' property
            colours = models.Product.properties()['colour'].choices

            # Only a 200 response is parsed; no else branch is visible in this listing
            if response.status_code == 200:
                result = response.content

                # Parse the response content into a MiniDom object
                minidom_document = html_parser.parse_html_document(result)

                # Get or create the Department; its name is the title-cased key
                department = models.Department.get_or_insert(
                    key_name=department_key, name=department_key.title())
                department_display_name = department.name

                # Get the Category name from the first child node of the page's first H1.
                # This lookup sits outside the inner try, so a page without an H1 raises here.
                category_name = minidom_document.getElementsByTagName(
                    'h1')[0].childNodes[0].nodeValue
                try:
                    # Key name is the lower-cased category name with spaces removed
                    if category_name is not None:
                        category_key_name = category_name.lower().replace(
                            ' ', '')
                except Exception, e:
                    # Logged only: category_key_name then keeps its initial '' value
                    logging.error('Get Category')
                    logging.error(e)

                # Get or create the Category, linked to the Department
                category = models.Category.get_or_insert(
                    key_name=category_key_name,
                    name=category_name,
                    department=department)

                # Collect all DIVs
                all_div_elements = minidom_document.getElementsByTagName('div')

                # For each DIV in the MiniDom object
                for div in all_div_elements:

                    # If the DIV id is productsList, we've found the product list container
                    if div.getAttribute('id') == 'productsList':
                        # Get the DIVs inside it
                        product_list_divs = div.getElementsByTagName('div')
                        # For each of those (note: this rebinds the outer loop's ``div`` name)
                        for div in product_list_divs:
                            # If the DIV includes a class value of 'productCont' then we expect it to be a product
                            if 'productCont' in div.getAttribute('class'):
                                # Per-product defaults, used when a field cannot be extracted
                                sku = None
                                name = ''
                                price = None
                                product = div
                                colour = ''
                                # _type is never reassigned in the visible code
                                _type = None
                                collection = None
                                url = None
                                # Get SKU code: the element id with every 'id_' removed
                                try:
                                    sku = product.getAttribute('id')
                                    sku = sku.replace('id_', '')
                                except Exception, e:
                                    # A missing SKU is fatal: log, then raise a new Exception
                                    logging.error('Get SKU')
                                    logging.error(e)
                                    raise Exception(
                                        'No product SKU found for category_url : '
                                        + category_url)

                                # Get Collection and Name from the first DIV whose class contains 'product_info'
                                try:
                                    # Note: ``div`` is rebound once more by this loop
                                    for div in product.getElementsByTagName(
                                            'div'):
                                        if 'product_info' in div.getAttribute(
                                                'class'):
                                            spans = div.getElementsByTagName(
                                                'span')
                                            for span in spans:
                                                if 'product_collection' in span.getAttribute(
                                                        'class'):
                                                    # The collection is the span's first child node value
                                                    collection = span.childNodes[
                                                        0].nodeValue
                                                else:
                                                    # Every other span contributes its stripped child node
                                                    # values to the name, separated by single spaces
                                                    nodes = span.childNodes
                                                    for node in nodes:
                                                        name = name + node.nodeValue.strip(
                                                        ) + ' '

                                            name = name.strip()
                                            break
                                except Exception, e:
                                    # A failure here is fatal: log, then raise a new Exception
                                    logging.error(
                                        'Error getting Collection and Name')
                                    logging.error(e)
                                    raise Exception(
                                        'No Product Name found for category_url : '
                                        + category_url)

                                # Get Product Colour: the first word of the name that is an allowed colour
                                try:
                                    for word in name.split(' '):
                                        if word in colours:
                                            colour = word
                                            break
                                except Exception, e:
                                    # Only a fixed message is logged; ``e`` itself is not
                                    logging.error('Error Getting Colour')

                                # Get Product URL: href of the first anchor whose class contains 'product_image_link'
                                try:
                                    anchors = product.getElementsByTagName('a')
                                    if len(anchors) > 0:
                                        for anchor in anchors:
                                            if 'product_image_link' in anchor.getAttribute(
                                                    'class'):
                                                url = anchor.getAttribute(
                                                    'href')
                                                break
                                except Exception, e:
                                    # Only a fixed message is logged; url keeps its current value
                                    logging.error('Error Getting Product URL')

                                # Get Product Price from the paragraphs whose class contains 'product_price'
                                try:
                                    product_paragraphs = product.getElementsByTagName(
                                        'p')
                                    for paragraph in product_paragraphs:
                                        if 'product_price' in paragraph.getAttribute(
                                                'class'):
                                            spans = paragraph.getElementsByTagName(
                                                'span')
                                            for span in spans:
                                                span_class = span.getAttribute(
                                                    'class')

                                                if 'one_price' in span_class:
                                                    # Single price: value of the span's second child node
                                                    price = span.childNodes[
                                                        1].nodeValue
                                                    # Leaves the span loop only; the paragraph loop continues
                                                    break
                                                elif 'now_price' in span_class:
                                                    # Reduced price: second child node of the nested 'price_value' span
                                                    for price_span in span.getElementsByTagName(
                                                            'span'):
                                                        if 'price_value' in price_span.getAttribute(
                                                                'class'):
                                                            price = price_span.childNodes[
                                                                1].nodeValue
                                                            # Leaves the price_span loop only
                                                            break
                                    if price is not None:
                                        # Drop thousands separators before converting to float
                                        price = price.replace(',', '')
                                        price = float(price)

                                except Exception, e:
                                    # Logged only: processing continues with whatever ``price`` holds
                                    logging.error('Get Price')
                                    logging.error(e)

                                # Create or Get the Product if it already exists.
                                # ``product`` is rebound here from the DOM element to the model entity.
                                product = models.Product.get_or_insert(
                                    key_name=sku,
                                    name=name,
                                    price=price,
                                    colour=colour,
                                    type=_type,
                                    category=category,
                                    department=department_display_name,
                                    collection=collection,
                                    url=url)

                                # Change flag for the Product; no read of it is visible in this listing
                                change = False

                                # Create Images: one Image per template entry, with '@id@' replaced by the SKU
                                try:
                                    for key, src in self.product_template[
                                            'images'].items():
                                        path = src.replace('@id@', sku)
                                        image = models.Image(type=key,
                                                             url=path)
                                        # Append the value returned by image.save() to the Product's image list
                                        image_key = image.save()
                                        product.images.append(image_key)

                                    change = True
                                except Exception, e:
                                    # A failure here is fatal: log, then raise a new Exception
                                    logging.error('Get Medium Image')
                                    logging.error(e)
                                    raise Exception(
                                        'No Product Medium Image found for category_url : '
                                        + category_url)
Esempio n. 3
0
    def process(self):
        """Fill in a stored Product's description and dimensions from its page.

        Reads ``product_key_name`` and ``product_url`` from ``self.request`` and
        fetches the URL with the method, deadline and headers held in
        ``self.urlfetch``. When the Product exists and has no description,
        width, height or depth, the page is parsed: inside the first DIV whose
        id contains "main_product_container", a paragraph whose id contains
        "product_desc" supplies the description and a paragraph whose id
        contains "dimensions" supplies width, height, depth and length. The
        Product is then saved, whether or not anything was found.

        Returns:
            None.

        Raises:
            Exception: if the fetch status is not 200. Any other failure
                (including a ValueError from converting a dimension to float)
                is logged and re-raised with its original traceback.
        """
        try:
            product_key_name = str(self.request.get("product_key_name"))
            product_url = str(self.request.get("product_url"))

            response = urlfetch.fetch(
                url=product_url,
                method=self.urlfetch["method"],
                deadline=self.urlfetch["deadline"],
                headers=self.urlfetch["headers"],
            )

            if response.status_code != 200:
                logging.error("product_url : " + product_url)
                logging.error(response.status_code)
                logging.error(response)
                raise Exception("URLFetch Error for " + product_url)

            product = models.Product.get_by_key_name(product_key_name)
            if product is None:
                return

            # Only evaluate the HTML when the description or one of the
            # dimensions is still missing on the stored Product.
            is_incomplete = (
                product.description is None
                or product.width is None
                or product.height is None
                or product.depth is None
            )
            if not is_incomplete:
                return

            # Parse the response content into a MiniDom object
            minidom_document = html_parser.parse_html_document(response.content)

            description = None
            dimensions = {}
            for div in minidom_document.getElementsByTagName("div"):
                if "main_product_container" not in div.getAttribute("id"):
                    continue

                for paragraph in div.getElementsByTagName("p"):
                    if len(paragraph.childNodes) == 0:
                        continue
                    paragraph_id = paragraph.getAttribute("id")

                    if "product_desc" in paragraph_id:
                        description = paragraph.childNodes[0].nodeValue

                    if "dimensions" in paragraph_id:
                        for node in paragraph.childNodes:
                            text = node.nodeValue
                            if text is None or node.nodeName != "#text":
                                continue
                            if "cm" in text:
                                dimensions.update(self._parse_dimensions(text))
                                # Only the first text node mentioning "cm" is used
                                break

                # Only the first matching container DIV is inspected
                break

            if description is not None:
                product.description = description
            # float() is deliberately unguarded: an unparsable dimension
            # aborts the update, exactly as before.
            for field in ("width", "height", "depth", "length"):
                if field in dimensions:
                    setattr(product, field, float(dimensions[field]))

            product.save()

        except Exception as e:
            logging.error(e)
            # Bare raise keeps the original traceback ("raise e" would reset it)
            raise

    @staticmethod
    def _parse_dimensions(text):
        """Return the raw dimension strings found before the first "cm" in text.

        Example input:
        "Bed dimensions are W7.4 x H15.8 x D7cm. More dimensions are...".

        The result maps "width", "height", "depth" and/or "length" to the text
        that follows the marker letter (W, H, D, L) in its "x"-separated
        segment; fields whose letter does not occur are absent. Values are
        returned as strings and are not validated here.
        """
        # Based on Habitat.co.uk website research the convention is physical
        # space first, then usable space second, so only the text before the
        # first "cm" is considered.
        dimensions_text, separator, _ = text.partition("cm")
        if not separator:
            return {}
        logging.debug("dimensions_text : " + dimensions_text)

        markers = (("W", "width"), ("H", "height"), ("D", "depth"), ("L", "length"))
        found = {}
        for segment in dimensions_text.split("x"):
            segment = segment.strip()
            for letter, field in markers:
                position = segment.find(letter)
                if position == -1:
                    continue
                # Keep everything after the first occurrence of the marker letter
                value = segment[position + 1:]
                if field == "height" and "-" in value:
                    # A height range such as "70-90" keeps the part after the first dash
                    value = value.split("-")[1]
                logging.debug(field + " : " + str(value))
                found[field] = value
        return found
Esempio n. 4
0
    def process(self):
        try:
            product_key_name = str(self.request.get('product_key_name'))
            product_url = str(self.request.get('product_url'))

            description = None
            width = None
            height = None
            depth = None
            length = None

            response = urlfetch.fetch(url=product_url,
                                      method=self.urlfetch['method'],
                                      deadline=self.urlfetch['deadline'],
                                      headers=self.urlfetch['headers'])

            if response.status_code == 200:

                # Get the Product
                product = models.Product.get_by_key_name(product_key_name)
                if product is not None:
                    # IF we are missing a Product Description or any of the dimensions, then evaluate the HTML
                    if product.description is None or product.width is None or product.height is None or product.depth is None:
                        # Set the result content
                        result = response.content

                        # Parse the response content into a MiniDom object
                        minidom_document = html_parser.parse_html_document(
                            result)

                        # Collect all DIV elements
                        all_div_elements = minidom_document.getElementsByTagName(
                            'div')
                        for div in all_div_elements:
                            if 'main_product_container' in div.getAttribute(
                                    'id'):
                                # Get Paragraphs
                                paragraphs = div.getElementsByTagName('p')

                                for paragraph in paragraphs:

                                    if len(paragraph.childNodes) > 0:
                                        # Get Description
                                        if 'product_desc' in paragraph.getAttribute(
                                                'id'):
                                            description = paragraph.childNodes[
                                                0].nodeValue

                                        # Get Dimensions
                                        if 'dimensions' in paragraph.getAttribute(
                                                'id'):
                                            dim_nodes = paragraph.childNodes
                                            for node in dim_nodes:
                                                # Find the Node with Dimensions in it
                                                # E.g. "Bed dimensions are W7.4 x H15.8 x D7cm. More dimensions are..."
                                                node_name = node.nodeName
                                                node_value = node.nodeValue
                                                if node_value is not None:
                                                    # Use the .find() method to locate the lowest index occurrence of the "cm" substring
                                                    # Based on Habitat.co.uk Website research the convention is physical space first
                                                    # then Usable space second
                                                    end_of_dimensions_index = node_value.find(
                                                        'cm')
                                                    if node_name == '#text':
                                                        if end_of_dimensions_index != -1:
                                                            dimensions_text = node_value[:
                                                                                         end_of_dimensions_index]
                                                            logging.debug(
                                                                'dimensions_text : '
                                                                +
                                                                dimensions_text
                                                            )
                                                            dimensions = [
                                                                s.strip()
                                                                for s in
                                                                dimensions_text
                                                                .split('x')
                                                            ]
                                                            for dim in dimensions:
                                                                if dim.find(
                                                                        'W'
                                                                ) != -1:
                                                                    width = dim.replace(
                                                                        dim[:
                                                                            dim
                                                                            .
                                                                            find(
                                                                                'W'
                                                                            ) +
                                                                            1],
                                                                        '')
                                                                    logging.debug(
                                                                        'width : '
                                                                        +
                                                                        str(width
                                                                            ))
                                                                if dim.find(
                                                                        'H'
                                                                ) != -1:
                                                                    height = dim.replace(
                                                                        dim[:
                                                                            dim
                                                                            .
                                                                            find(
                                                                                'H'
                                                                            ) +
                                                                            1],
                                                                        '')
                                                                    if '-' in height:
                                                                        height = height.split(
                                                                            '-'
                                                                        )[1]
                                                                    logging.debug(
                                                                        'height : '
                                                                        +
                                                                        str(height
                                                                            ))
                                                                if dim.find(
                                                                        'D'
                                                                ) != -1:
                                                                    depth = dim.replace(
                                                                        dim[:
                                                                            dim
                                                                            .
                                                                            find(
                                                                                'D'
                                                                            ) +
                                                                            1],
                                                                        '')
                                                                    logging.debug(
                                                                        'depth : '
                                                                        +
                                                                        str(depth
                                                                            ))
                                                                if dim.find(
                                                                        'L'
                                                                ) != -1:
                                                                    length = dim.replace(
                                                                        dim[:
                                                                            dim
                                                                            .
                                                                            find(
                                                                                'L'
                                                                            ) +
                                                                            1],
                                                                        '')
                                                                    logging.debug(
                                                                        'length : '
                                                                        +
                                                                        str(length
                                                                            ))

                                                            # Break out of node loop
                                                            break

                                # Break out of all DIVs loop
                                break

                        if description is not None:
                            product.description = description
                        if width is not None:
                            product.width = float(width)
                        if height is not None:
                            product.height = float(height)
                        if depth is not None:
                            product.depth = float(depth)
                        if length is not None:
                            product.length = float(length)

                        product.save()

            else:
                logging.error('product_url : ' + product_url)
                logging.error(response.status_code)
                logging.error(response)
                raise Exception('URLFetch Error for ' + product_url)

        except Exception, e:
            logging.error(e)
            raise e