class ExpressSpider(CrawlSpider):
    """Scrapy spider for scraping Express product pages.

    Reads product URLs/ids from an Excel sheet, parses each product page into
    an ``ExpressItem`` (master product, color children, size grandchildren,
    optional "shop look"/"shop line" variants), accumulates everything in a
    ``VariantsXml`` document, and on spider close writes the XML, optionally
    exports to a database and mails a report.

    NOTE(review): written for Python 2 / old Scrapy (``print`` statements,
    ``StandardError``, ``HtmlXPathSelector``, ``dispatcher.connect``).
    """

    name = "express"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]  # placeholder; replaced in __init__
    temp_msg = ""                 # accumulates sheet-vs-site id mismatch notes for the report mail
    handle_httpstatus_list = [404]  # let 404 responses reach parse() instead of being dropped
    counter = 0                   # number of responses processed so far

    def __init__(self, *a, **kw):
        """Wire the close signal, read CLI arguments, load the Excel sheet and
        prepare the XML builder. Raises through ``get_lists_from_excel`` error
        handling if the sheet cannot be read."""
        super(ExpressSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = CommonTerminal(sys.argv, self.name)
        self.log = Logger()
        self.d = terminal.get_arguments()  # parsed CLI options (at least 'file' and 'upload')
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        shops = CreateShops(self.d['file'], self.xml)
        try:
            shops.get()
        except IndexError:
            print "This sheet has no shop look or line"
        self.get_lists_from_excel()
        self.add_properties(self.xml)
        # NOTE(review): only the first two URLs are scraped ([:2]) — looks like a
        # leftover debugging limit; confirm before production use.
        self.start_urls = self.url_list[:2]
        self.total = len(self.start_urls)

    def parse(self, response):
        """Parse one product page into an ExpressItem and feed it to the XML
        builder, including child products and shop look/line variants.

        Returns the (last written) item so Scrapy's pipelines (image download
        via 'image_urls') still run."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = ExpressItem()
        index = self.url_list.index(response.url)
        # mark this URL slot as processed by overwriting it with the counter
        self.url_list[index] = self.counter
        flag = 0       # set to 1 once the page looks like a real product page
        shop_look = 0  # NOTE(review): assigned but never read in this method
        # main try that catches all unhandled errors
        try:
            hxs = HtmlXPathSelector(response)
            if response.url != "http://www.zmags.com/":
                error_404 = hxs.select('//img[@alt="404 Error Page Not Found"]').extract()
                flag = 1
                if not error_404:
                    # NOTE(review): flag is set both before and inside this branch;
                    # presumably only one assignment was intended — confirm.
                    flag = 1
                    available = hxs.select('//span[@class="glo-tex-error"]/text()').extract()
                    page = " ".join(hxs.select('//html').extract())
                    # part for creating main product in xml
                    id = self.get_product_id(hxs)[0]
                    if id != self.id_list[index]:
                        # sheet id and on-site id disagree; remember it for the report mail
                        msg = "\nNot equal, id in sheet {0}, on site {1}".format(self.id_list[index], id)
                        self.temp_msg += msg
                    item['product_id'] = [id]
                    item['name'] = self.get_name(hxs)
                    item['description'], item['promo_text'] = self.get_basic_info(hxs)
                    item['master_price'], item['discount_price'] = self.get_product_prices(hxs)
                    item['shop_look'] = ['False']
                    item['normal'] = ['True']
                    item['shop_line'] = ['False']
                    item['in_stock'] = ["NOT_IN_STOCK"]
                    # NOTE(review): available[0] raises IndexError when the error span is
                    # absent; that lands in the broad StandardError handler below — confirm
                    # whether that is the intended control flow.
                    if available[0] != "This item is no longer available for purchase.":
                        item['category_id'], item['subcategory_id'] = self.get_categories(hxs)
                        item['add_to_cart_id'] = self.get_add_to_cart_id(hxs)
                        color_names, urls, swatch_image_names, jsons = self.get_swatch_images(hxs)
                        #urls = basic.cdata_field(self.map_url_to_server(urls, id, True))
                        item['color_image_url'] = self.create_color_json(urls, color_names)
                        item['in_stock'] = ["IN_STOCK"]
                        item['product_page'] = [response.url]
                        self.xml.create_xml(item)
                        product_images, images_grouped = self.parse_jsons(jsons, color_names)
                        ids, sizes, prices = self.get_variants(page)
                        # calling function that will handle creating all child products
                        self.create_child_products(id, ids, sizes, prices, images_grouped)
                        item['image_urls'] = urls + product_images
                        if self.shop_look_list[index]:
                            self.parse_for_shop_look(hxs, self.shop_look_list[index], id, page,
                                                     images_grouped, response.url, index)
                        if self.shop_line_list[index]:
                            self.parse_for_shop_look(hxs, self.shop_line_list[index], id, page,
                                                     images_grouped, response.url, index)
                    else:
                        # product no longer purchasable: write what we have, log code 102
                        self.xml.create_xml(item)
                        self.exc.code_handler(102, response.url)
                else:
                    # on-site 404 page
                    self.exc.code_handler(104, response.url)
            else:
                # redirected to zmags.com: URL was not really provided
                basic.not_provided()
                self.exc.code_handler(101, response.url)
            if not flag:
                # page never looked like a product page; emit a minimal NOT_AVAILABLE record
                item['product_id'] = [self.id_list[index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                item['name'] = ["not available"]
                self.xml.create_xml(item)
        except StandardError:
            self.exc.code_handler(100, response.url)
        #if it's last product write xml and run end_operations
        return item

    def parse_for_shop_look(self, hxs, id, product_id, page, images_grouped, product_url, index):
        """Special parse function for shop looks and lines. It gets same info stored
        in different format, mostly json and reference to master product id that is
        actually shop look/line id.
        TO DO: see if there is need to specially handle the case for not available

        :param id: shop look/line id (becomes the master product id)
        :param product_id: the real product's style id
        :param page: full page markup (string) used for variant extraction
        :param images_grouped: dict color name -> list of image names
        """
        item = ExpressItem()
        item['master_product_id'] = [id]
        item['product_id'] = [id + "_" + product_id]
        if self.ordered:
            item['order_index'] = [self.order_list[index]]
        item['style'] = [product_id]
        item['product_page'] = [product_url]
        item['category_id'], item['subcategory_id'] = self.get_categories(hxs)
        item['add_to_cart_id'] = self.get_add_to_cart_id(hxs)
        # below is part for creating swatch images and images json
        color_names, urls, swatch_image_names, jsons = self.get_swatch_images(hxs)
        i = 0
        colors = []
        for k in color_names:
            # one JSON object per color: swatch url plus absolute image urls
            d = {'name': k, 'swatch_url': urls[i],
                 'image_url': self.get_absolute_url(images_grouped[k])}
            i += 1
            colors.append(simplejson.dumps(d))
        item['colors'] = basic.cdata_field(colors)
        item['price'], item['discount_price'] = self.get_product_prices(hxs)
        item['description'], item['promo_text'] = self.get_basic_info(hxs)
        item['name'] = self.get_name(hxs)
        # below is part for creating variants json
        ids, sizes, prices = self.get_variants(page)
        variants = []
        for k in ids:
            d = {'color': k, 'prices': prices[k], 'ids': ids[k]}
            try:
                d['sizes'] = sizes[k]
            except StandardError:
                print "This product has no sizes"
            variants.append(simplejson.dumps(d))
        item['variants'] = basic.cdata_field(variants)
        self.xml.create_xml(item)

    def parse_shop_look(self, hxs):
        """Alternative shop-look parser that emits one ensemble master item plus one
        child item per product in the ensemble.

        NOTE(review): uses the bare name ``xml`` (not ``self.xml``) and hard-coded
        "DUMMIE1" ids — this looks like unfinished/experimental code; unless a
        module-level ``xml`` object exists, calling it raises NameError. Verify.
        """
        products = hxs.select('//div[@id="cat-ens-prod-item"]')
        i = 0
        # do this with actual id
        item = ExpressItem()
        whole_page = hxs.extract()
        whole_page = "".join(whole_page)
        # ensemble id lives in inline javascript, not in the DOM
        ensemble_id = basic.get_middle_text(whole_page, "ensembleId: '", "',")
        name = hxs.select('//div[@id="cat-ens-prod-con"]/h1/text()').extract()
        name = basic.clean_string_field(name)
        item['ensemble_id'] = ensemble_id
        item['normal_image_url'] = self.shl_get_image(hxs)
        item['product_id'] = ["DUMMIE1"]
        item['shop_look'] = ['True']
        item['normal'] = ['False']
        item['shop_line'] = ['False']
        item['in_stock'] = ['IN_STOCK']
        item['name'] = name
        xml.create_xml(item)
        item.clear()
        for p in products:
            i += 1
            item = ExpressItem()
            item['master_product_id'] = ['DUMMIE1']
            item['product_id'] = ["DUMMIE1_" + str(i)]
            item['name'], item['price'], item['style'] = self.shl_basic_info(p)
            page = p.extract()
            item['variants'] = basic.cdata_field([self.shl_create_variants(self.get_variants(page))])
            item['colors'] = basic.cdata_field(self.shl_get_swatches(p))
            xml.create_xml(item)
        # return images for download here once it's needed

    def get_categories(self, hxs):
        """Return (category_id, sub_category_id) lists read from hidden form inputs."""
        category_id = hxs.select('//input[@name="categoryId"]/@value').extract()
        sub_category_id = hxs.select('//input[@name="subCategoryId"]/@value').extract()
        return category_id, sub_category_id

    def get_add_to_cart_id(self, hxs):
        """Return the add-to-cart product id list from the hidden productId input."""
        return hxs.select('//input[@name="productId"]/@value').extract()

    def shl_get_image(self, hxs):
        """Return the shop-look main image URL (single-element list), built from the
        'imagesets' variable embedded in the page's javascript."""
        page = hxs.extract()
        image = basic.get_middle_text(page, 'imagesets = "', '";')
        image = "http://t.express.com/com/scene7/s7d5/=/is/image/expressfashion/%s/i81" % (image[0])
        return [image]

    def shl_create_variants(self, f):
        """Creates variants for shop look products. Stored in dict with all info and
        returned as json.

        :param f: (ids, sizes, prices) tuple as returned by ``get_variants``
        """
        d_main = {}
        n = []
        colors = [p for p in f[0]]
        for c in colors:
            d = {'color': c, 'ids': f[0][c]}
            try:
                d['sizes'] = f[1][c]
            except StandardError:
                print "This product has no sizes"
            d['prices'] = f[2][c]
            n.append(d)
        # NOTE(review): d_main is assigned but the list n is what gets serialized
        d_main['variants'] = n
        return simplejson.dumps(n)

    def shl_get_swatches(self, hxs):
        """Function for getting swatches for shop look way.
        Stores information in dict (name, swatch_url and image url).

        NOTE(review): the dict ``d`` is reused across iterations; since each dump
        happens after the keys are overwritten the output is correct, but a fresh
        dict per color would be clearer.
        """
        p = hxs.select('div[@class="cat-ens-prod-info"]/div[@class="cat-ens-prod-swatch-display"]')
        p = p.select('span/text()').extract()
        l = []
        d = {}
        for c in p:
            # each span holds "name,swatch_url,image_url" comma-separated
            temp = c.split(",")
            d['name'] = temp[0]
            d['swatch_url'] = temp[1]
            d['image_url'] = temp[2]
            l.append(simplejson.dumps(d))
        return l

    def shl_basic_info(self, hxs):
        """Return (name, price, style) lists for one shop-look product node."""
        name = hxs.select('div[@class="cat-ens-prod-info"]/h1/text()').extract()
        name = basic.clean_string_field(name)
        price = hxs.select('div[@class="cat-ens-prod-info"]/span/text()').extract()
        price = basic.clean_spaces_field(basic.clean_string_field(price))
        style = hxs.select('div[@class="cat-ens-prod-info"]/text()').extract()
        if len(style) > 2:
            style = [basic.clean_string(style[1])]
        else:
            style = []
        return name, price, style

    def create_color_json(self, urls, names):
        """Return a list of JSON strings pairing each swatch url with its color name.

        NOTE(review): the dict ``d`` is shared across iterations (same pattern as
        shl_get_swatches) — works because dumps happens each pass.
        """
        d = {}
        n = []
        for i in range(0, len(urls)):
            d['url'] = urls[i]
            d['name'] = names[i]
            n.append(simplejson.dumps(d))
        return n

    def get_basic_info(self, hxs):
        """Gets basic info about products. Returns description and promo text
        (description CDATA-wrapped; promo text may be an empty list)."""
        description = hxs.select('//li[@class="cat-pro-desc"]').extract()[0]
        description = basic.clean_string(description)
        description = [basic.cdata(description)]
        promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/text()').extract()
        if not promo_text:
            # some pages wrap the promo text in a font tag instead
            promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/font').extract()
        if promo_text:
            promo_text = basic.cdata_field(promo_text)
        return description, promo_text

    def get_name(self, hxs):
        """Return the cleaned product name as a single-element list."""
        name = hxs.select('//div[@id="cat-pro-con-detail"]/h1/text()').extract()[0]
        name = [basic.clean_string(name)]
        return name

    def get_product_prices(self, hxs):
        """Gets product prices, regular and discount if it exists.
        If no discount returns empty field. Prices are stripped to digits, dot
        and comma."""
        price = hxs.select('//li[@class="cat-pro-price"]/strong/text()').extract()
        discount_price = []
        if not price:
            # discounted layout: old price + sale price in separate spans
            price = hxs.select('//li[@class="cat-pro-price"]/span[@class="cat-glo-tex-oldP"]/text()').extract()
            discount_price = hxs.select('//li[@class="cat-pro-price"]/span[@class="cat-glo-tex-saleP"]/text()').extract()
        if discount_price:
            discount_price = [re.sub('[^0-9.,]', '', discount_price[0])]
        price = [re.sub('[^0-9.,]', '', price[0])]
        return price, discount_price

    def get_product_id(self, hxs):
        """Gets product sku from the page as a field"""
        sku = hxs.select('//input[@name="omnitureStyleID"]/@value').extract()[0]
        sku = sku.replace(";", "")
        return [sku]

    def get_swatch_images(self, hxs):
        """Function for getting swatch images info (names, urls, image names and urls).
        Also it gets and json as list of json urls for getting images set for every
        color. Falls back to the javascript image set (color "no_color") when the
        page has no swatches."""
        urls = hxs.select('//li[@id="widget-product-swatches"]/a/img/@src').extract()
        color_names = hxs.select('//li[@id="widget-product-swatches"]/a/img/@alt').extract()
        swatch_image_names = self.get_swatch_image_name(urls)
        if not swatch_image_names and not color_names:
            color_names.append("no_color")
            swatch_image_names = self.get_imagesets(hxs)
        jsons = self.get_json(swatch_image_names)
        return color_names, urls, swatch_image_names, jsons

    def get_imagesets(self, hxs):
        """Function for getting image set in case where there is no color for product.
        Gets image set info from the javascript on the page and selects only first
        one, if there is more because there is only one color to associate with
        (no_color)"""
        page = hxs.extract()
        print len(page)  # NOTE(review): debug output, consider removing
        iset = basic.get_middle_text(page, 'imagesets = "', '"; //Change')
        iset = iset[0].split(',')
        return [iset[0]]

    def get_swatch_image_name(self, image_sites):
        """Gets swatch image name from swatch image url"""
        image_names = []
        for x in range(0, len(image_sites)):
            # name sits between ".../fashion/" and "_s" in the swatch url
            name = basic.get_middle_text(image_sites[x], "fashion/", "_s")[0]
            image_names.append(name)
        return image_names

    def get_json(self, image_names):
        """Gets list of jsons from list of swatch images names"""
        jsons = []
        for i in range(0, len(image_names)):
            json = "http://s7d5.scene7.com/is/image/expressfashion/" + image_names[i] + "?req=imageset,json"
            jsons.append(json)
        return jsons

    def parse_jsons(self, jsons, color_names):
        """Parsing json from json urls. Returning all images in field, also returns
        them grouped by colors, so those groups can be used later when creating
        child products in xml.

        NOTE(review): performs blocking urllib2 downloads inside the spider —
        bypasses Scrapy's scheduler; acceptable for this scraper's scale.
        """
        images = []
        images_grouped = {}
        for i in range(0, len(jsons)):
            json = urllib2.urlopen(jsons[i]).read()
            image = basic.get_middle_text(json, '"expressfashion/', ";")
            rest_of_images = basic.get_middle_text(json, ',expressfashion/', ";")
            temp = image + rest_of_images
            images_grouped = basic.add_to_dict(images_grouped, color_names[i], temp)
            images += temp
        return self.get_absolute_url(images), images_grouped

    def get_absolute_url(self, images):
        """Gets absolute path for images. Receives field of relative path images and
        returns absolute paths (scene7 urls, width-limited to 351px)."""
        image_urls = []
        for x in range(0, len(images)):
            image_url = "http://s7d5.scene7.com/is/image/expressfashion/" + images[x]
            image_url += "?width=351"
            image_urls.append(image_url)
        return image_urls

    def get_variants(self, page):
        """Getting variants from javascript on the page. Returns three dicts ids,
        sizes and prices.
        Format of the dicts is like (key = color, value = field of (ids, sizes and
        prices))"""
        # isolate the variant-creation javascript section by its comment markers
        temp = page.split("// Load the product variants")[1]
        temp = temp.split("// Set the field to update with the product variant")[0]
        variants = temp.split("// Create the variant")
        sizes = {}
        ids = {}
        prices = {}
        for i in range(1, len(variants)):
            color = basic.get_middle_text(variants[i], "Color','", "')")
            if color:
                color = color[0]
            else:
                color = "no_color"
            ids = basic.add_to_dict(ids, color, basic.get_middle_text(variants[i], "setId('", "')")[0])
            if variants[i].find("Size','") != -1:
                sizes = basic.add_to_dict(sizes, color, basic.get_middle_text(variants[i], "Size','", "')")[0])
            prices = basic.add_to_dict(prices, color, basic.get_middle_text(variants[i], 'numericPrice="', '"')[0])
        return ids, sizes, prices

    def get_image_url(self, images, is_swatch=False):
        """Returns path for images on our servers. If it's for swatch it return also
        swatch paths.

        NOTE(review): depends on module-level ``normal_image_url`` /
        ``thumb_image_url`` not visible in this file chunk — verify they exist.
        """
        image_paths = []
        thumb_paths = []
        for x in range(0, len(images)):
            path = normal_image_url + images[x] + ".jpg"
            thumb_path = thumb_image_url + images[x] + ".jpg"
            image_paths.append(path)
            thumb_paths.append(thumb_path)
        if is_swatch is True:
            return image_paths
        else:
            return image_paths, thumb_paths

    def create_child_products(self, main_id, ids, sizes, prices, images_grouped):
        """Creating child products (both colors and sizes). Arguments it gets are:
        main_id as product id of the master product, images_grouped that is a dict
        of images grouped by color (field i field) and dicts ids, sizes and prices
        (e.g. dict with color names as keys and fields of ids for it as values
        'black': ['32854, '32855'']).

        Color children get ids main_id_a, main_id_b, ...; size grandchildren get
        main_id_a_a, main_id_a_b, ... (chr(97)=='a'; NOTE(review): breaks past 26).
        """
        item = ExpressItem()
        i = 0
        for k in ids:
            cur_id = main_id + "_" + chr(i + 97)
            item['product_id'] = [cur_id]
            item['master_product_id'] = [main_id]
            item['color'] = [k]
            # use this for some other path (our server)
            # images, thumbs = self.get_image_url(images_grouped[i])
            if images_grouped:
                images = self.get_absolute_url(images_grouped[k])
                # item['normal_image_url'], item['thumb_image_url'] = self.map_url_to_server(images,main_id)
                item['normal_image_url'] = basic.cdata_field(self.map_url_to_server(images, main_id))
            self.xml.create_xml(item)
            item.clear()
            j = 0
            for val in ids[k]:
                item['product_id'] = [cur_id + "_" + chr(j + 97)]
                item['master_product_id'] = [cur_id]
                if len(sizes):
                    item['size'] = [sizes[k][j]]
                item['size_option_id'] = [ids[k][j]]
                item['price'] = [prices[k][j]]
                self.xml.create_xml(item)
                j += 1
            i += 1

    def map_url_to_server(self, urls, main_id, is_swatch=False):
        """Map original image urls to our-server paths (sha1-hashed filenames).

        NOTE(review): the immediate ``return urls`` short-circuits the mapping, so
        the code below is currently dead — apparently a deliberate toggle to keep
        original urls; confirm before relying on server paths.
        """
        return urls
        new = []
        new1 = []
        for i in range(0, len(urls)):
            new.append(image_path + "/" + main_id + "/full/" + hashlib.sha1(urls[i]).hexdigest() + ".jpg")
            if is_swatch is False:
                new1.append(image_path + "/" + main_id + "/thumb/" + hashlib.sha1(urls[i]).hexdigest() + ".jpg")
        if is_swatch is True:
            return new
        else:
            return new, new1

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting to
        database and sending appropriate mail message."""
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        self.xml.write_xml(self.name, self.d['file'])
        msg += self.exc.create_message(self.counter)
        msg += "\n{0}".format(self.temp_msg)
        exp = CommonExport()
        # part for exporting to database here
        if self.d['upload']:
            try:
                exp.xml_to_db(self.name, self.d['file'], "e2b3b658-16d5-4059-a9df-3c212c817d2c")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        msg += self.log.get_message()
        from modules.mail import Mail
        mail = Mail()
        mail.send_mail(msg, "Express scraper report")

    def get_lists_from_excel(self):
        """Load url/id/shop-look/shop-line (and optional ordering) columns from the
        input Excel sheet into instance lists. Errors are routed to the exception
        handler with code 103."""
        xls = CommonExcel(basic.get_excel_path(self.name, self.d['file']))
        self.ordered = True
        try:
            self.url_list = xls.read_excel_collumn_for_urls(4, 1)
            self.id_list = xls.read_excel_collumn_for_ids(0, 1)
            self.shop_look_list = xls.read_excel_collumn(1, 1)
            self.shop_line_list = xls.read_excel_collumn(2, 1)
            try:
                self.order_list = xls.read_excel_collumn_for_ids(6, 1)
            except:
                # ordering column is optional
                self.ordered = False
                self.log.add_message("No order provided in this sheet.")
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)

    def add_properties(self, xml):
        """Declare the custom item properties this spider emits on the XML builder."""
        xml.add_property("size_option_id", "Size Option Id", "text")
        xml.add_property("color_image_url", "Color Image Url", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("variants", "Variants", "text_list")
        xml.add_property("style", "Style", "text")
        xml.add_property("mode", "Mode", "text")
        xml.add_property("shop_look", "Shop look", "boolean")
        xml.add_property("shop_line", "Shop line", "boolean")
        xml.add_property("normal", "Normal", "boolean")
        xml.add_property("ensemble_id", "Ensemble ID", "text")
        xml.add_property("promo_text", "Promo text", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("product_page", "Product page", "text")
        xml.add_property("master_price", "Master Price", "decimal")
        xml.add_property("subcategory_id", "Sub Category ID", "text")
        xml.add_property("add_to_cart_id", "Add to cart ID", "text")
        xml.add_property("order_index", "Order Index", "integer")
class LydiasSpider(CrawlSpider):
    """Scrapy spider for scraping Lydia's Uniforms product pages.

    Product list comes either from a database (interface run) or an Excel sheet
    (console run). Each page is parsed into a ``LydiasItem`` with colors, sizes
    (as JSON blobs), ratings and images; results go into a ``VariantsXml``
    document that is written out when the spider closes, followed by a status
    mail and (for database runs) a log file.

    NOTE(review): Python 2 / old Scrapy code (``print`` statements,
    ``StandardError``, ``HtmlXPathSelector``).
    """

    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]  # placeholder; replaced in __init__
    counter = 0  # number of responses processed so far

    def __init__(self, *a, **kw):
        """Wire the close signal, read CLI arguments and load the product list
        from database or Excel depending on the 'database' flag."""
        super(LydiasSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        # fix for bug with links they provide
        self.products['urls'] = basic.cut_string_field(self.products['urls'], "&cat=")
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Parse one product page into a LydiasItem, create its color/size child
        data and record a per-product status ("ran", "not_avail" or "error") for
        the database update on close. Returns the item so the image pipeline can
        use 'image_urls'."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        # redirected requests must be matched back to the originally requested url
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        id = self.products['product_ids'][index]
        try:
            # the "searchfor" div only appears on the not-found/search page
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [id]
                item['name'], item['price'], item['old_price'], item['description'] = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = self.get_image_swatches(hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                self.create_subproducts(id, color_names, product_image, color_codes, hxs)
                # collected for Scrapy's image download pipeline
                item['image_urls'] = product_image + color_urls + chart + default_images
                self.products['status'][index] = "ran"
            else:
                self.exc.code_handler(102, response.url)
                item['product_id'] = [id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    # function for checking if product has embroidery or not
    def get_embroidery(self, hxs):
        """Return ["True"]/["False"] depending on a telltale javascript snippet
        that disables the logo-color control when embroidery is unavailable."""
        page = hxs.select('//html').extract()[0]
        if "document.getElementById('logocolor').disabled = true;" in page:
            return ["True"]
        else:
            return ["False"]

    # function for creating json with all information for colors
    def make_colors_json(self, color_urls, color_names, color_codes):
        """Return a list of CDATA-wrapped JSON strings, one per color, with the
        server-side swatch url, color name and short color code.

        NOTE(review): ``dict`` shadows the builtin and is reused across iterations;
        output is correct because dumps happens each pass, but a fresh dict per
        color would be clearer.
        """
        dict = {}
        jsons = []
        for i in range(0, len(color_urls)):
            dict['color_url'] = self.get_server_path_single(color_urls[i])
            dict['color_name'] = color_names[i]
            dict['color_short'] = color_codes[i]
            json = basic.cdata(simplejson.dumps(dict))
            jsons.append(json)
        return jsons

    # function for getting image server path
    def get_server_path_single(self, url):
        """Return the image-store path for one url (sha1-hashed filename)."""
        # return url
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # function for getting image path for field of images
    def get_server_path(self, urls):
        """Return image-store paths for a list of urls (sha1-hashed filenames)."""
        # return urls
        new = []
        for url in urls:
            new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    #function for getting basic information for product
    def get_basic_info(self, hxs):
        """Return (name, price, old_price, [description]) for a product page.
        Prices are stripped to digits and dot; description is CDATA-wrapped."""
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
        if not price:
            # alternate layout keeps the price in a span with id PriceDisplay
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    # function for getting rating, both number and sentence (e.g. Rating 5 out of 6 votes)
    def get_rating(self, hxs):
        """Return (rating, sentence) — the numeric rating extracted from the
        "Rating: X out of Y" sentence, or two empty values when absent."""
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if temp:
            rating = basic.get_middle_text(temp[0].replace(" ", ""), "Rating:", "out")
            return rating, temp
        else:
            return [], temp

    #function for getting reviews, returning rating and field of json reviews
    # or empty fields if there's no reviews
    def get_reviews(self, hxs):
        """Return a list of review JSON strings, or [] when the page has none.
        Only the first review block is read."""
        reviews = hxs.select('//div[@class="prodReview"]')
        if reviews:
            title = reviews[0].select('p[@class="review_title"]/text()').extract()
            text = reviews[0].select('p[@class="review_text"]/text()').extract()
            author = reviews[0].select('p[@class="review_author"]/text()').extract()
            location = reviews[0].select('p[@class="review_location"]/text()').extract()
            jsons = self.make_reviews_json(title, text, author, location)
            return jsons
        else:
            return []

    # function for making json for reviews
    # currently not in use. cause there are no reviews in DPW design
    def make_reviews_json(self, title, text, author, location):
        """Build CDATA-wrapped review JSON strings.

        NOTE(review): contains debug prints and an ``os._exit(0)`` that kills the
        whole process before any JSON is built — clearly leftover debugging; must
        be removed before this method is used again.
        """
        jsons = []
        print len(title)
        print len(text)
        print len(author)
        print len(location)
        os._exit(0)
        for i in range(0, len(title)):
            json = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" : "%s" }' % (title[i], text[i], author[i], location[i])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    #function for getting size chart image
    def get_size_image(self, hxs):
        """Return the size-chart image url(s) from the sizing tab panel."""
        temp = hxs.select('//div[@class="TabbedPanelsContent cells"]/img/@src').extract()
        return temp

    #function for getting image swatches, returning fields (image_urls, image name, product color image)
    def get_image_swatches(self, hxs):
        """Return (color_images, color_names, products_image, color_codes) lists,
        one entry per color swatch node; the short color code is parsed out of the
        swatch's onclick javascript."""
        colors = hxs.select('//div[@class="lolite"]')
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in colors:
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            #if zoom image needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            color_codes.append(
                color.select('a/@onclick').extract()[0].split(",")[1].replace("'", ""))
        return color_images, color_names, products_image, color_codes

    #function for getting additional images, returns field of images or empty field if there is no
    def get_extra_images(self, hxs):
        """Return extra thumbnail image names parsed from the AddImg script block,
        or [] when the product has none."""
        additional_images = hxs.select('//div[@id="AddImg"]/script/text()').extract()
        if additional_images:
            temp = basic.get_middle_text(additional_images[0], '"', '"')
            thumb_images = temp[0].split(",")
            return thumb_images
        else:
            return []

    #function for getting product id from the page
    def get_product_id(self, hxs):
        """Return the product id embedded in the page-wrap javascript."""
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        id = basic.get_middle_text(temp[0], 'productid","', '"')
        return id[0]

    # function for getting sizes from another url, retunrning field of jsons for sizes
    # one id from the page is 115NB, if needed here to hardcode for testing
    # currently not in use
    def get_sizes(self, id, hxs):
        """Fetch the size-options AJAX endpoint for *id* and return size JSON blobs.
        Blocking urllib2 call; currently unused."""
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
        jsons = []
        print "reading page..."
        page = urllib2.urlopen(url).read()
        print "page read"
        # the endpoint returns javascript arrays; normalize them to bare CSV rows
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" : "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    # function that handles creating subproducts, can be implemented for the usual way product for every combination
    # of size and color if needed
    def create_subproducts(self, id, color_names, product_image, color_codes, hxs):
        item = LydiasItem()
        # if no colors for specific product do this part and call to creating size children with empty string instead
        # of actual color name
        if len(color_names) == 0:
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)
        # for handling cases when there are color options for specific product, create child for every color, and call
        # for creating size children for every provided color
        else:
            for i in range(0, len(color_names)):
                print "name :" + color_names[i] + " code:" + color_codes[i]
                item['master_product_id'] = [id]
                item['product_id'] = [id + "_" + str(i)]
                item['color'] = [color_names[i]]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path([product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + str(i), color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    # function for creating child products for sizes
    # little messy with all the commented lines but those lines can be used if needed to go back to old way with
    # child products instead of json
    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return a list of CDATA-wrapped size JSON blobs for one color of *main_id*.
        With a color code the sizes come from a blocking AJAX call; without one
        they are read off the page itself."""
        print color_code
        jsons = []
        # if block for cases when color is provided
        if color_code != "":
            showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
            url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
                  "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
            page = urllib2.urlopen(url).read()
            # normalize the javascript array response to bare CSV rows
            page = page.replace("'", "")
            page = page.replace("[", ",")
            page = page.replace(",,", "")
            temp = page.split("]")
            for i in range(0, len(temp) - 2):
                tmp = temp[i].split(",")
                item = {}
                # item['master_product_id'] = [id]
                item['size_short'] = tmp[0]
                item['price_url'] = self.get_size_price(str(main_id), str(color_code), tmp[0])
                item['size'] = tmp[1]
                # item['product_id'] = [id + "_" + str(i)]
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
        # when the color is not provided different block of code cause it's done differently on the page
        else:
            temp = hxs.select('//div[@class="not_size"]/text()').extract()
            for i in range(0, len(temp)):
                item = {}
                # item['master_product_id'] = [id]
                # item['product_id'] = [id + "_" + str(i)]
                item['size_short'] = temp[i]
                item['price_url'] = self.get_size_price(str(main_id), "", temp[i])
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
            # return 0

    # function for getting price for combination of every size and color, can return url where the price is, or can
    # parse that url to get that actual price but will drastically increase scraping time
    def get_size_price(self, id, color, size):
        """Return the (url-encoded) showprice AJAX url for a sku/color/size combo."""
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        url = url.replace(" ", "%20")
        return url

    # just adding part for getting absolute paths for relative paths from page
    def absolute_path(self, urls):
        """Prefix each relative page url with the site's domain."""
        new = []
        for i in urls:
            new.append("http://www.lydiasuniforms.com" + i)
        return new

    # function used for gettin embroidery information from clients page, was used only once to get it
    # cause embroidery is the same for all the products
    def get_emb(self, hxs):
        """One-off extractor for embroidery options (lettering colors, lettering
        styles, logos) written straight to XML.

        NOTE(review): uses the bare name ``xml`` (not ``self.xml``) — NameError
        unless a module-level xml object exists; the statements after ``return``
        are unreachable debug leftovers. One-shot utility, kept as-is.
        """
        emb = hxs.select('//div[@id="emb"]').extract()
        lettering_colors = hxs.select('//select[@id="threadcolor"]/option/@value').extract()
        urls = []
        d = {}
        colors = []
        for i in range(1, len(lettering_colors)):
            d['type'] = "lettering colors"
            d['name'] = lettering_colors[i]
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
            d['url'] = self.get_server_path_single(url)
            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(d)))
        lettering = hxs.select('//select[@id="lettering"]/option/@value').extract()
        l = {}
        letterings = []
        for i in range(1, len(lettering)):
            l['type'] = "lettering"
            l['name'] = lettering[i]
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering[i].lower().replace(' ', '_') + ".gif"
            l['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(l)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        logos = {}
        log = []
        for i in range(1, len(logo)):
            logos['type'] = "logo"
            logos['name'] = logo[i]
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo[i].lower() + ".jpg"
            logos['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(logos)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        xml.create_xml(item)
        xml.write_xml("emb")
        return urls
        print colors, letterings, log
        os._exit(0)

    def handle_not_provided(self):
        """Emit a NOT_AVAILABLE XML record for every product that came without a URL."""
        item = LydiasItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting to
        database and sending appropriate mail message.

        NOTE(review): if the database branch fails before ``get_name`` returns,
        ``filename`` is unbound and the later ``write_xml``/mail calls raise —
        confirm whether that failure mode is acceptable.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, filename, "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            # persist the report next to other runs for the interface to pick up
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load urls/ids/names from the Excel sheet into ``self.products``, then
        dedupe, split off url-less rows into ``self.no_urls`` and attach a status
        column. Errors are routed to the exception handler with code 103."""
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)
class LydiasSpider(CrawlSpider):
    """Spider for lydiasuniforms.com product pages.

    Loads the product list from a database (interface runs) or an Excel
    sheet (console runs), scrapes each product page into LydiasItem
    records, writes them out through VariantsXml, and mails a summary
    when the run ends.

    NOTE(review): Python 2 code throughout (print statements,
    StandardError); several locals shadow builtins (id, dict, json, l).
    """

    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    # number of responses parsed so far; compared to self.total at shutdown
    counter = 0

    def __init__(self, *a, **kw):
        """Read CLI arguments, build the XML writer, and load the product
        list either from the database or from the Excel sheet."""
        super(LydiasSpider, self).__init__(*a, **kw)
        # run spider_closed() when Scrapy signals end of crawl
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        # tolerate up to 5 per-product errors before aborting — presumably;
        # verify against ZmagsException implementation
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        # fix for bug with links they provide: strip trailing "&cat=..." junk
        self.products['urls'] = basic.cut_string_field(self.products['urls'], "&cat=")
        # emit NOT_AVAILABLE stubs for products that came without a URL
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Scrape one product page into a LydiasItem and emit it as XML.

        Looks up the product by its (possibly pre-redirect) URL, records a
        per-product status ("ran" / "not_avail" / "error") back into
        self.products, and returns the item so Scrapy can download
        item['image_urls'].
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        # if the site redirected us, index by the URL we originally requested
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        id = self.products['product_ids'][index]
        try:
            # the "searchfor" div only appears on the not-found page
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [id]
                item['name'], item['price'], item['old_price'], item['description'] = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = self.get_image_swatches(hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                # child products (colors/sizes) are written as separate XML nodes
                self.create_subproducts(id, color_names, product_image, color_codes, hxs)
                item['image_urls'] = product_image + color_urls + chart + default_images
                self.products['status'][index] = "ran"
            else:
                # product no longer exists on the site
                self.exc.code_handler(102, response.url)
                item['product_id'] = [id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except:
            # deliberate catch-all: one bad product must not kill the crawl;
            # code_handler counts errors and aborts after the threshold
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    def get_embroidery(self, hxs):
        """Return ["True"] if the page's JS disables the logo-color input,
        i.e. the product has no embroidery options, else ["False"]."""
        page = hxs.select('//html').extract()[0]
        if "document.getElementById('logocolor').disabled = true;" in page:
            return ["True"]
        else:
            return ["False"]

    def make_colors_json(self, color_urls, color_names, color_codes):
        """Build one CDATA-wrapped JSON string per color swatch
        (server-side image path, display name, short code)."""
        # NOTE(review): `dict` shadows the builtin and is reused/mutated
        # across iterations; safe only because simplejson.dumps snapshots it
        dict = {}
        jsons = []
        for i in range(0, len(color_urls)):
            dict['color_url'] = self.get_server_path_single(color_urls[i])
            dict['color_name'] = color_names[i]
            dict['color_short'] = color_codes[i]
            json = basic.cdata(simplejson.dumps(dict))
            jsons.append(json)
        return jsons

    def get_server_path_single(self, url):
        """Map a source image URL to its path in the image store
        (Scrapy's images pipeline names files sha1(url).jpg)."""
        # return url
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_server_path(self, urls):
        """Vector form of get_server_path_single() for a list of URLs."""
        # return urls
        new = []
        for url in urls:
            new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    def get_basic_info(self, hxs):
        """Extract name, price, old price and cleaned description.

        Returns (name, price, old_price, [description]); prices are
        stripped down to digits and dots.
        """
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
        # fallback selector used by some page layouts
        if not price:
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        # NOTE(review): raises IndexError if both price selectors miss;
        # caught by the catch-all in parse()
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    def get_rating(self, hxs):
        """Return (rating, sentence) parsed from e.g.
        "Rating: 5 out of 6 votes"; ([], []) when absent."""
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if temp:
            rating = basic.get_middle_text(temp[0].replace(" ", ""), "Rating:", "out")
            return rating, temp
        else:
            return [], temp

    def get_reviews(self, hxs):
        """Return review JSON strings from the first prodReview div,
        or [] when the page has no reviews."""
        reviews = hxs.select('//div[@class="prodReview"]')
        if reviews:
            title = reviews[0].select('p[@class="review_title"]/text()').extract()
            text = reviews[0].select('p[@class="review_text"]/text()').extract()
            author = reviews[0].select('p[@class="review_author"]/text()').extract()
            location = reviews[0].select('p[@class="review_location"]/text()').extract()
            jsons = self.make_reviews_json(title, text, author, location)
            return jsons
        else:
            return []

    def make_reviews_json(self, title, text, author, location):
        """Build CDATA-wrapped JSON per review.

        Currently not in use, since there are no reviews in the DPW design.
        NOTE(review): contains debug prints and an os._exit(0) that makes
        the loop below unreachable — the whole process dies if this is
        ever called. Dead to remove or fix before reuse.
        """
        jsons = []
        print len(title)
        print len(text)
        print len(author)
        print len(location)
        os._exit(0)
        for i in range(0, len(title)):
            json = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" :\
 "%s" }' % (title[i], text[i], author[i], location[i])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    def get_size_image(self, hxs):
        """Return the size-chart image URL(s) from the sizing tab."""
        temp = hxs.select('//div[@class="TabbedPanelsContent cells"]/img/@src').extract()
        return temp

    def get_image_swatches(self, hxs):
        """Extract per-color swatch data.

        Returns parallel lists: (swatch image urls, color names,
        product image urls, color short-codes).
        """
        colors = hxs.select('//div[@class="lolite"]')
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in colors:
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            # if zoom image needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            # short code is the 2nd argument of the swatch's onclick handler
            color_codes.append(color.select('a/@onclick').extract()[0].split(",")[1].replace("'", ""))
        return color_images, color_names, products_image, color_codes

    def get_extra_images(self, hxs):
        """Return extra thumbnail URLs parsed out of the AddImg script
        block, or [] when the product has none."""
        additional_images = hxs.select('//div[@id="AddImg"]/script/text()').extract()
        if additional_images:
            temp = basic.get_middle_text(additional_images[0], '"', '"')
            thumb_images = temp[0].split(",")
            return thumb_images
        else:
            return []

    def get_product_id(self, hxs):
        """Return the product id embedded in the page's wrap script."""
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        id = basic.get_middle_text(temp[0], 'productid","', '"')
        return id[0]

    def get_sizes(self, id, hxs):
        """Fetch size options from the showoptions AJAX endpoint and
        return them as CDATA JSON strings.

        One id from the page is 115NB, if needed here to hardcode for
        testing. Currently not in use.
        """
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
        jsons = []
        print "reading page..."
        # synchronous fetch outside Scrapy's scheduler (blocks the reactor)
        page = urllib2.urlopen(url).read()
        print "page read"
        # crude cleanup of the JS-array response before splitting
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
 "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    def create_subproducts(self, id, color_names, product_image, color_codes, hxs):
        """Emit one child XML product per color (or a single NO_COLOR
        child) with its size options attached as custom_size JSON.

        Can be reworked to the usual one-product-per-size-and-color
        scheme if needed.
        """
        item = LydiasItem()
        # no colors: single child carrying all sizes, color name passed as ""
        if len(color_names) == 0:
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)
        # one child per provided color, each with its own size list
        else:
            for i in range(0, len(color_names)):
                print "name :" + color_names[i] + " code:" + color_codes[i]
                item['master_product_id'] = [id]
                item['product_id'] = [id + "_" + str(i)]
                item['color'] = [color_names[i]]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path([product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + str(i), color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return CDATA JSON size entries for one (product, color) pair.

        Commented lines are the old per-size child-product scheme, kept
        for reference in case we need to switch back from JSON.
        """
        print color_code
        jsons = []
        # color provided: sizes come from the showoptions AJAX endpoint
        if color_code != "":
            showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
            url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
                  "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
            # blocking fetch; response is a JS array we strip down by hand
            page = urllib2.urlopen(url).read()
            page = page.replace("'", "")
            page = page.replace("[", ",")
            page = page.replace(",,", "")
            temp = page.split("]")
            for i in range(0, len(temp) - 2):
                tmp = temp[i].split(",")
                item = {}
                # item['master_product_id'] = [id]
                item['size_short'] = tmp[0]
                item['price_url'] = self.get_size_price(str(main_id), str(color_code), tmp[0])
                item['size'] = tmp[1]
                # item['product_id'] = [id + "_" + str(i)]
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
        # no color: sizes are plain text divs on the page itself
        else:
            temp = hxs.select('//div[@class="not_size"]/text()').extract()
            for i in range(0, len(temp)):
                item = {}
                # item['master_product_id'] = [id]
                # item['product_id'] = [id + "_" + str(i)]
                item['size_short'] = temp[i]
                item['price_url'] = self.get_size_price(str(main_id), "", temp[i])
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
        # NOTE(review): unreachable — both branches above return
        return 0

    def get_size_price(self, id, color, size):
        """Return the showprice AJAX URL for a (sku, color, size) combo.

        Returns the URL rather than the fetched price; resolving each URL
        here would drastically increase scraping time.
        """
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        url = url.replace(" ", "%20")
        return url

    def absolute_path(self, urls):
        """Prefix site-relative paths with the lydiasuniforms.com origin."""
        new = []
        for i in urls:
            new.append("http://www.lydiasuniforms.com" + i)
        return new

    def get_emb(self, hxs):
        """One-off scrape of embroidery options (thread colors, lettering,
        logos) — embroidery is identical for all products, so this was run
        once and is not part of the normal flow.

        NOTE(review): `xml.create_xml` / `xml.write_xml` reference a bare
        `xml` name, not self.xml — this will NameError if called as-is.
        The print / os._exit(0) after `return urls` are unreachable debug
        leftovers.
        """
        emb = hxs.select('//div[@id="emb"]').extract()
        lettering_colors = hxs.select('//select[@id="threadcolor"]/option/@value').extract()
        urls = []
        d = {}
        colors = []
        # range starts at 1: option[0] is the "choose one" placeholder — presumably
        for i in range(1, len(lettering_colors)):
            d['type'] = "lettering colors"
            d['name'] = lettering_colors[i]
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
            d['url'] = self.get_server_path_single(url)
            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(d)))
        lettering = hxs.select('//select[@id="lettering"]/option/@value').extract()
        l = {}
        letterings = []
        for i in range(1, len(lettering)):
            l['type'] = "lettering"
            l['name'] = lettering[i]
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering[i].lower().replace(' ', '_') + ".gif"
            l['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(l)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        logos = {}
        log = []
        for i in range(1, len(logo)):
            logos['type'] = "logo"
            logos['name'] = logo[i]
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo[i].lower() + ".jpg"
            logos['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(logos)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        xml.create_xml(item)
        xml.write_xml("emb")
        return urls
        print colors, letterings, log
        os._exit(0)

    def handle_not_provided(self):
        """Emit a NOT_AVAILABLE XML stub for every product that has no URL."""
        item = LydiasItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml,
        exporting to database and sending appropriate mail message.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml: DB name for interface runs, sheet name otherwise
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                # NOTE(review): if get_name() is what failed, `filename` is
                # unbound and write_xml below raises NameError
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, filename, "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except:
            # appended after the mail attempt, so this note only reaches the log file
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load urls / product_ids / names from the Excel sheet into
        self.products, then split off rows without URLs into self.no_urls.

        Column/row offsets (3, 15 etc.) are sheet-layout specific.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)
class KennethSpider(CrawlSpider):
    """Spider for kennethcole.com product pages.

    Loads the product list from a database or Excel, scrapes each product
    (plus per-color and per-size child products parsed out of inline
    JavaScript), emits XML via VariantsXml, and mails a summary at the end.

    NOTE(review): Python 2 code (print statements, iteritems, StandardError);
    references module-level globals `counter` / `images_number` that are not
    defined in this class.
    """

    name = "kenneth"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    # number of responses parsed so far; compared to self.total at shutdown
    counter = 0

    def __init__(self, *a, **kw):
        """Read CLI arguments, build the XML writer, and load the product
        list either from the database or from the Excel sheet."""
        super(KennethSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.images_store = "/" + settings['IMAGES_STORE'] + "/"
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        print self.d
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        # emit NOT_AVAILABLE stubs for products that came without a URL
        self.no_url_products(self.no_urls)
        self.start_urls = self.products['urls']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page into a KennethItem and emit it as XML.

        Main try for the whole scrape: any error is counted and reported
        with the URL it happened on rather than killing the crawl.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = KennethItem()
        try:
            cur_url = response.url
            # the noResultsContent div marks a product that no longer exists;
            # otherwise continue scraping the page
            available = hxs.select('//div[@id="noResultsContent"]').extract()
            if not available:
                index = self.products['urls'].index(cur_url)
                cur_id = self.get_product_id(cur_url)
                id = self.products['product_ids'][index]
                page = hxs.select('//div[@id="mainContent"]').extract()
                page = " ".join(page)
                item['name'], item['description'] = self.get_basic_info(hxs)
                price, new_p, old_p = self.get_prices(hxs)
                # sale products carry both a new and an old price
                if new_p:
                    item['new_price'] = new_p
                    item['old_price'] = old_p
                else:
                    item['price'] = price
                desc = basic.clean_string(item['description'][0])
                item['description'] = [desc]
                urls = self.get_color_image(hxs)
                new = self.get_image_server_path(urls, id)
                item['color_image_urls'] = new
                self.export(item['color_image_urls'], [id], "swatchImage")
                jsons, images = self.we_also_recommend(cur_id, id)
                item['product_page'] = [cur_url]
                item['product_id'] = [id]
                item['add_to_cart_id'] = [cur_id]
                item['recommended_product'] = jsons
                item['in_stock'] = ["IN_STOCK"]
                self.products['status'][index] = "ran"
                # get_colors() emits the per-color children and returns their
                # image urls, or 404 when the colors script is missing
                images_or_404 = self.get_colors(hxs, page, id)
                if images_or_404 == 404:
                    item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                item['image_urls'] = []
                if images_or_404 != 404:
                    item['image_urls'] += images_or_404
                item['image_urls'] += urls
                item['image_urls'] += images
                #self.export(item['image_urls'])
                #item['image_urls'] = []
                #uncomment for donwloading images
            else:
                # product not available: rebuild its canonical URL to find it
                # in our list, then emit a NOT_AVAILABLE stub
                cur_id = self.get_product_id(cur_url)
                cur_url = "http://www.kennethcole.com/product/index.jsp?"
                cur_url += "productId=" + str(cur_id)
                index = self.products['urls'].index(cur_url)
                self.products['status'][index] = "no_avail"
                item['product_id'] = [self.products['product_ids'][index]]
                # NOTE(review): condition checks product_ids but uses names —
                # looks like it should test self.products['names'][index]; confirm
                if self.products['product_ids'][index]:
                    item['name'] = [self.products['names'][index]]
                else:
                    item['name'] = ["not available"]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                self.exc.code_handler(102, cur_url)
        except:
            # deliberate catch-all: count the error and remember the URL it
            # happened on
            print "Error occured scraping this product"
            index = self.products['urls'].index(cur_url)
            self.products['status'][index] = "error"
            self.exc.code_handler(100, cur_url)
        return item

    def no_url_products(self, no_url):
        """Emit a NOT_AVAILABLE XML stub for every product without a URL."""
        item = KennethItem()
        for n in no_url['product_ids']:
            item['product_id'] = [n]
            index = no_url['product_ids'].index(n)
            item['name'] = [no_url['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return (name, [CDATA description]) from the product info blocks."""
        name = hxs.select('//div[@id="productInfoTop"]/h1/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="productDescription"]').extract()[0])
        return name, [description]

    def get_prices(self, hxs):
        """Return ([price], new_price, old_price); new/old are non-empty
        only for sale products. Price is stripped to digits, dots, commas."""
        price = hxs.select('//div[@id="productInfoTop"]/h2/text()').extract()[0]
        new_p = hxs.select('//h2[@class="sale-now"]/text()').extract()
        old_p = hxs.select('//span[@class="productGrey"]/text()').extract()
        price = re.sub('[^0-9.,]', '', price)
        return [price], new_p, old_p

    def get_color_image(self, hxs):
        """Return the color swatch image URLs for the product."""
        return hxs.select('//div[@id="productInfoR2W"]/img/@src').extract()

    def get_colors(self, hxs, page, main_id):
        """Parse the page's `displays` JavaScript into per-color child
        products, emit them as XML, then trigger size-children creation.

        Returns the list of collected image URLs, or 404 when the colors
        script is absent (product not available).
        """
        item = KennethItem()
        try:
            tmp = page.split('displays[0]')[1]
        except IndexError:
            print "This product is not available"
            return 404
        script = tmp.split('</script>')[0]
        displays = script.split("};")
        # NOTE(review): `global counter` is a leftover; `counter` is never
        # used in this method
        global counter
        ids = []
        images = []
        color_ids = []
        sizes_script = self.get_sizes_part_page(page)
        # maps site colorId -> our running color index (as string)
        color_internal_code = {}
        for x in range(0, len(displays) - 1):
            id = basic.get_middle_text(displays[x], 'colorId: "', '"')
            ids.append(id[0])
            reg = displays[x].count("Reg")
            images_in = []
            # the script stores view images as vw1Reg..vwNReg, sometimes
            # without a space after the colon
            for i in range(1, reg + 1):
                image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg: "', '"')
                if len(image) == 0:
                    image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg:"', '"')
                if (len(image) > 0):
                    if (image[0] != "null"):
                        images_in.append(image[0])
            # fall back to the page's default product image
            if not images_in:
                images_in = hxs.select('//input[@name="productImage"]/@value').extract()
            color_ids.append(str(main_id) + "_" + str(x))
            item['product_id'] = [str(main_id) + "_" + str(x)]
            item['color_option_id'] = id
            item['master_product_id'] = [main_id]
            item['normal_image_url'] = self.get_image_server_path(images_in, main_id)
            item['thumb_image_url'] = self.get_image_server_path_thumb(images_in, main_id)
            item['in_stock'] = ["NOT_IN_STOCK"]
            item['color'] = self.get_color_name(sizes_script, id[0])
            color_internal_code[id[0]] = str(x)
            self.xml.create_xml(item)
            images += images_in
            self.export(item['normal_image_url'], item['product_id'], "productImage")
        self.get_sizes(sizes_script, ids, main_id, color_internal_code)
        return images

    def get_sizes(self, page, ids, main_id, color_internal_code):
        """Parse per-size options from the sizes JavaScript into dicts
        keyed by colorId ({id: [values...]}), then emit size children.

        Returns the collected dicts for optional further use.
        """
        options = page.split("};")
        skus = {}
        colors_name = {}
        inStocks = {}
        sizes = {}
        prices = {}
        for x in range(0, len(options) - 1):
            id = basic.get_middle_text(options[x], 'cId: "', '"')
            for i in range(0, len(ids)):
                if (id[0] == ids[i]):
                    sku = basic.get_middle_text(options[x], 'sku: ', ',s')
                    sku = re.sub("[^0-9]", "", sku[0])
                    skus = self.add_to_dict(skus, ids[i], sku)
                    size = basic.get_middle_text(options[x], 'sDesc: "', '"')
                    sizes = self.add_to_dict(sizes, ids[i], size[0])
                    price = basic.get_middle_text(options[x], 'price: "', '"')
                    price = self.clean_price(price[0])
                    prices = self.add_to_dict(prices, ids[i], price[0])
                    available = basic.get_middle_text(options[x], 'avail: "', '"')
                    inStocks = self.add_to_dict(inStocks, ids[i], available[0])
        self.create_subproducts_xml(main_id, color_internal_code, colors_name, sizes, skus, inStocks, prices)
        return main_id, colors_name, sizes, skus, inStocks, prices

    def create_subproducts_xml(self, main_id, color_internal_code, colors_name, sizes, skus, inStocks, prices):
        """Emit one XML child product per (color, size) combination,
        mapping the site's availability strings onto our stock values."""
        number = 0
        # NOTE(review): leftover, `counter` unused here
        global counter
        for k, v in sizes.iteritems():
            item = KennethItem()
            for i in range(0, len(v)):
                item['size'] = [v[i]]
                item['size_option_id'] = [skus[k][i]]
                m_id = main_id + "_" + color_internal_code[k]
                item['master_product_id'] = [m_id]
                id = m_id + "_" + str(i)
                item['product_id'] = [id]
                if inStocks[k][i] == "NOT_AVAILABLE":
                    item['in_stock'] = ["NOT_IN_STOCK"]
                elif inStocks[k][i] == "ADVANCED_SALE_LIMITED":
                    item['in_stock'] = ["IN_STOCK"]
                else:
                    item['in_stock'] = [inStocks[k][i]]
                item['price'] = [prices[k][i]]
                #item['color'] = colors_name[k]
                self.xml.create_xml(item)
                number += 1

    def add_to_dict(self, dict, index, value):
        """Append value to dict[index], creating the list on first use.
        (EAFP equivalent of collections.defaultdict(list).)"""
        try:
            dict[index].append(value)
        except:
            dict[index] = [value]
        return dict

    def we_also_recommend(self, id, main_id):
        """Fetch "we also recommend" products from the Certona (res-x)
        recommendation service.

        Returns (json list with per-product info, list of image urls).
        """
        url = "http://www.res-x.com/ws/r2/Resonance.aspx?appid=kennethcole01&t"
        url += "k=154212870918247&ss=525178103419747&sg=1&pg=897706724574618&b"
        url += "x=true&vr=2.67&sc=product_rr&ev=product&ei=" + id + "&cu=&ct=k"
        url += "ennethcolec01&no=3&cb=r1eh&clk=&cv1=" + id + "&cv23=63&ur=http%"
        url += "3A//www.kennethcole.com/product/index.jsp%3FproductId%3D3" + id
        url += "&plk=&rf="
        import urllib2
        # blocking fetch outside Scrapy's scheduler
        page = urllib2.urlopen(url).read()
        temp = page.split("certonaRecBoxes")
        images = []
        ids = []
        names = []
        prices = []
        urls = []
        # parse the recommendation boxes out of the response markup
        for i in range(1, len(temp)):
            id = [basic.get_middle_text(temp[i], "d=", '\\"')[0]]
            image = basic.get_middle_text(temp[i], 'src=\\"', '\\"')[0]
            name = basic.get_middle_text(temp[i], 'alt=\\"', '\\"')
            price = basic.get_middle_text(temp[i], '<br>', '</a>')
            url = "http://www.kennethcole.com/product/index.jsp?productId="
            url += id[0]
            urls.append(url)
            ids.append(id)
            names.append(name)
            prices.append(price)
            images.append(image)
        jsons = self.make_json(ids, names, prices, self.get_image_server_path(images, main_id), urls)
        return jsons, images

    def get_product_id(self, url):
        """Return the productId query value from a product URL."""
        return url.split("=")[1]

    def make_json(self, ids, names, prices, images, urls):
        """Build one CDATA JSON string per recommended product.
        NOTE(review): hand-built JSON — no escaping of quotes in names."""
        jsons = []
        for i in range(0, len(ids)):
            json = "{" + ' "id" : "' + str(ids[i][0]) + '", '
            json += '"name" : "' + str(names[i][0]) + '", '
            # insert function for storing the right image path
            json += '"image_url" : "' + str(images[i]) + '", '
            json += '"product_url" : "' + urls[i] + '", '
            json += '"price" : "' + str(prices[i][0]) + '" } '
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    def get_sizes_part_page(self, page):
        """Return the JavaScript fragment that declares the size options."""
        tmp = page.split("availDates = new Array();")[1]
        script = tmp.split("</script>")[0]
        return script

    def get_color_name(self, script, id):
        """Return [color name] for a colorId by scanning backwards from
        the id to the nearest preceding cDesc entry."""
        temp = script.split(id)
        temp = temp[0].split('cDesc: "')
        temp = temp[len(temp) - 1]
        name = temp.split('"')[0]
        return [name]
        # NOTE(review): unreachable alternative return kept from an older version
        return {id: name}

    def export(self, images, id, tags):
        """Upload product images to the admin REST endpoint.

        Disabled by default — set override to 0 to actually upload,
        anything else skips the upload.
        """
        override = 1
        if override == 0:
            import MultipartPostHandler
            import urllib2
            import os
            url = 'http://api.admin.zmags.com/productImage/import?key=5ef90922-283b-4412-a1c8-3e70bc28b9d3'
            for i in range(0, len(images)):
                image_name = self.get_image_name(images[i])
                path = "images/kenneth_images/small/" + str(image_name)
                params = {'file': file(path, 'rb'), 'product_id': id[0], 'index': str(i + 1), 'tags': tags}
                #token not working
                opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
                code = opener.open(url, params).getcode()
                # anything but 202 Accepted is treated as a failed upload
                if (code != 202):
                    print ("Achtung")
                global images_number
                images_number += 1
                print images_number
                print "Image uploaded to product " + id[0]
        else:
            #print "Image upload overriden.."
            pass

    def get_image_server_path(self, urls, id):
        """Map source image URLs to their full-size paths in the image
        store, keyed by product id and original file name."""
        # print urls
        new = []
        for url in urls:
            temp = url.split("/")
            new.append(self.images_store + id + "/full/" + temp[len(temp) - 1])
        return new

    def get_image_server_path_thumb(self, urls, id):
        """Thumbnail variant of get_image_server_path() ("/small/")."""
        new = []
        for url in urls:
            temp = url.split("/")
            new.append(self.images_store + id + "/small/" + temp[len(temp) - 1])
        return new

    def clean_price(self, price):
        """Strip a price string down to digits, dots and commas."""
        return [re.sub('[^0-9.,]', '', price)]

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml,
        exporting to database and sending appropriate mail message.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped {0} product out of {1}\n\n".format(self.counter, self.total)
        # filename for writing xml: DB name for interface runs, sheet name otherwise
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                # NOTE(review): if get_name() failed, `filename` is unbound
                # and write_xml below raises NameError
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            # NOTE(review): the try/except around the export is commented
            # out, so an export failure now propagates out of spider_closed
            #try:
            exp.xml_to_db(self.name, filename, "29eac9ea-8c57-4d22-baf4-3f1471dc3ab6")
            msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "KennethCole: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "KennethCole: {0}".format(filename), self.d['email'])
        except:
            # appended after the mail attempt, so this note only reaches the log file
            msg += "\nSending mail failed."
        if self.d['database']:
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load urls / product_ids / names from the Excel sheet into
        self.products, then split off rows without URLs into self.no_urls.

        Column/row offsets (2, 2 etc.) are sheet-layout specific.
        NOTE(review): unlike the Lydias variant, the post-processing below
        is not inside an `else:` — it also runs after a handled read error.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(2, 2)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(0, 2)
            self.products['names'] = xls.read_excel_collumn(1, 2)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom product properties this spider emits with
        the XML writer."""
        xml.add_property("add_to_cart_id", "Add To Cart Id", "text")
        xml.add_property("product_page", "Product page", "text")
        xml.add_property("color_image_urls", "Color Image URLs", "text_list")
        xml.add_property("color_option_id", "Color Option ID", "text")
        xml.add_property("recommended_product", "Recommended Product", "text_list")
        xml.add_property("size_option_id", "Size Option ID", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("new_price", "New Price", "text")