Python ZmagsException.code_handler Exemples, modules.exception.ZmagsException.code_handler Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : guitar_center_spider.py Projet : marjevtic/testMarko

class GuitarCenterSpider(CrawlSpider):
    """Spider that scrapes product pages and records each product via an XML writer.

    The class-level ``start_urls`` is only a placeholder: ``__init__`` replaces
    it with the product URLs loaded from the database or from an Excel sheet.
    """
    # Spider name; also passed to the terminal parser, XML writer and log path.
    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    # Number of responses handled so far (incremented in parse()).
    counter = 0

    def __init__(self, *a, **kw):
        """Load the product lists and prepare the XML writer before crawling.

        Products come from the database when the parsed arguments carry a
        truthy ``"database"`` entry, otherwise from the Excel sheet. Products
        without a URL are written to the XML immediately as not available;
        the remaining product URLs become ``start_urls``.
        """
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        argument_parser = DatabaseTerminal(sys.argv, self.name)
        self.d = argument_parser.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if not self.d["database"]:
            self.get_lists_from_excel()
        else:
            self.database = Database()
            self.database.connect()
            selected = self.database.select_products(self.d["catalog_id"], self.d["product_id"])
            self.products, self.no_urls = selected
            self.database.disconnect()
        self.add_properties(self.xml)
        self.handle_not_provided()
        # Crawl exactly the URLs listed for the products; total drives the
        # progress output and the completeness check in spider_closed().
        product_urls = self.products["urls"]
        self.start_urls = product_urls
        self.total = len(product_urls)

    def parse(self, response):
        """Scrape one product page into a GuitarCenterItem and record it.

        The product row is located by the originally requested URL (the first
        entry of ``redirect_urls`` when the request was redirected). On
        success the item is written to the XML, its status is set to "ran"
        and ``image_urls`` is attached. On any StandardError the status is
        set to "error" and error code 100 is reported for the URL.

        Returns the item (only partially filled when scraping failed).
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()

        # The URL list holds the URLs as they were requested, so a redirected
        # response has to be matched through its first recorded redirect URL.
        if "redirect_urls" in response.request.meta:
            cur_url = response.request.meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            item["product_id"] = [self.products["product_ids"][index]]
            item["name"], item["brand"] = self.get_basic_info(hxs)
            item["heading"], item["details"], item["specs"], item["call_to_action"] = self.get_description(hxs)
            item["brand_image"], item["brand_image_promo"], brand_images = self.get_description_images(hxs)
            item["old_price"], item["discount"], item["price"] = self.get_prices(hxs)
            item["image_json"], img = self.get_images(hxs)
            item["serial"] = self.get_serials(hxs)
            item["warranty"] = self.gold_coverage(hxs)
            item["in_stock"] = self.get_available(hxs)
            item["product_ref"], item["add_to_cart_id"] = self.get_add_to_cart(hxs)
            # Without an add-to-cart id the product cannot be bought.
            if not item["add_to_cart_id"]:
                item["in_stock"] = ["NOT_AVAILABLE"]
            item["shipping"] = self.get_shipping(hxs)
            item["colors"] = self.get_colors(hxs)
            self.products["status"][index] = "ran"
        except StandardError:
            self.products["status"][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            # Set after the XML is written, so image_urls is not part of the
            # XML entry; img and brand_images only exist on the success path.
            item["image_urls"] = img + brand_images
        return item

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE XML entry for every product that has no URL.

        ``self.no_urls["product_ids"]`` and ``self.no_urls["names"]`` are read
        position by position, so each id is paired with the name stored at the
        same position even when the same id occurs more than once.
        """
        item = GuitarCenterItem()
        for index, product_id in enumerate(self.no_urls["product_ids"]):
            item["product_id"] = [product_id]
            item["name"] = [self.no_urls["names"][index]]
            item["in_stock"] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return ``(name, brand)`` for the product page.

        The name is a one-element list built from all text nodes of the
        ``h1.fn`` heading, cleaned and stripped of non-breaking spaces; the
        brand is the raw list of text nodes of ``span.brand``.
        """
        title_parts = hxs.select('//h1[@class="fn"]/text()').extract()
        cleaned_title = basic.clean_string("".join(title_parts))
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        return [cleaned_title.replace(u"\xa0", "")], brand

    def get_description_images(self, hxs):
        """Return ``(brand_image, brand_image_promo, images)``.

        The first two are lists holding the stored path of the first brand
        logo / promo logo (empty when the page has none); ``images`` holds all
        original logo URLs found on the page.
        """
        logo_urls = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        promo_urls = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        # Collected before mapping so the original URLs are what gets returned.
        all_urls = logo_urls + promo_urls
        brand_image = [self.get_server_path(logo_urls[0])] if logo_urls else logo_urls
        brand_image_promo = [self.get_server_path(promo_urls[0])] if promo_urls else promo_urls
        return brand_image, brand_image_promo, all_urls

    def get_description(self, hxs):
        """Return heading, details, specs and call-to-action for the product.

        Each part is extracted with its own XPath and passed through
        ``basic.cdata_field``; the four results are returned as a tuple in
        that order.
        """
        xpaths = (
            '//div[@id="description"]/p',
            '//p[@class="description"]',
            '//div[@class="specs"]/ul',
            '//div[@class="callToAction"]/p/text()',
        )
        # Extract everything first, then wrap, mirroring the original call order.
        extracted = [hxs.select(xpath).extract() for xpath in xpaths]
        heading, details, specs, call_to_action = [basic.cdata_field(part) for part in extracted]
        return heading, details, specs, call_to_action

    # function for getting prices, returns tags and values or empty field if no option for one of them new is discount
    def get_prices(self, hxs):
        """Return ``(old_price, discount, price)`` as lists of cleaned strings.

        * ``old_price`` - first ``dd`` value of the line item list, only when
          more than one ``dt`` tag is listed.
        * ``discount`` - last ``dd`` value of the line item list.
        * ``price`` - text of ``span.topAlignedPrice``; when none of the three
          was found, the ``dd`` values of the inline list are used instead.

        Each list is empty when the corresponding value is missing. All values
        are reduced to digits and dots by ``clean_price``.
        """
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
        try:
            discount = [basic.clean_string(value[-1])]
        except IndexError:
            # No dd values at all in the line item list.
            print("This product has no price.")
        # extract() returns a (possibly empty) list, so no IndexError can
        # occur here and no handler is needed.
        price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    # returning json with image url and serial number of product image refers to
    def get_images(self, hxs):
        """Return ``(images_list, img)`` for the product thumbnails.

        ``images_list`` holds one ``basic.cdata``-wrapped JSON string per
        thumbnail with its ``image_url`` and ``product_serial`` (the ``li``
        class with the ``site1sku`` marker removed); ``img`` holds the
        original thumbnail link targets.
        """
        hrefs = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        css_classes = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        encoded = []
        original_urls = []
        for position, href in enumerate(hrefs):
            entry = {}
            entry["image_url"] = self.get_server_path(href)
            original_urls.append(href)
            # replace() leaves the class untouched when the marker is absent,
            # so no separate branch is needed for that case.
            entry["product_serial"] = css_classes[position].replace("site1sku", "")
            encoded.append(basic.cdata(simplejson.dumps(entry)))
        return encoded, original_urls

    # function for getting serials and all information about them, currently returns field with jsons with all
    # information, can be modified to return dicts if needed for subproducts for those one day
    def get_serials(self, hxs):
        """Return the hidden ``styleInfo`` JSON blobs of the page.

        Every blob is parsed and re-serialized with ``simplejson`` and then
        wrapped with ``basic.cdata``; the result is a list with one entry per
        blob.
        """
        blobs = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        return [basic.cdata(simplejson.dumps(simplejson.loads(blob))) for blob in blobs]

    def get_server_path(self, url):
        """Return the image path to record for *url*.

        The absolute URL of the scraped site is kept unchanged.
        """
        # To record the path of a locally stored copy instead, return
        # IMAGES_STORE + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg".
        return url

    # function for getting gold coverage from the page which is actually additional warranty options
    def gold_coverage(self, hxs):
        """Return the gold coverage options offered on the page.

        Each checkbox value is paired with the label at the same position and
        returned as a ``basic.cdata``-wrapped JSON string with the keys
        ``id`` and ``name``.
        """
        option_ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        option_labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        encoded = []
        for position, option_id in enumerate(option_ids):
            option = {"id": option_id, "name": option_labels[position]}
            encoded.append(basic.cdata(simplejson.dumps(option)))
        return encoded

    # function for getting availability
    def get_available(self, hxs):
        """Return the availability of the product as a list.

        Without an availability marker on the page the product counts as
        ``IN_STOCK`` (pages with per-colour stock status carry no marker of
        their own). A first marker of ``in_stock`` is upper-cased; any other
        marker list is returned unchanged.
        """
        markers = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if not markers:
            return ["IN_STOCK"]
        if markers[0] == "in_stock":
            return [markers[0].upper()]
        return markers

    # function for getting add to cart id and product reference
    def get_add_to_cart(self, hxs):
        """Return ``(product_ref, add_to_cart_id)`` as one-element lists.

        Both values come from the ``data-rel`` attribute of the add-to-list
        link, which holds them separated by ``|``. When the link is missing
        the product is reported as not available and two empty lists are
        returned.

        Raises:
            IndexError: if the attribute is present but contains no ``|``.
        """
        data_rel = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()
        if not data_rel:
            # Only the "link not on the page" case is treated as not
            # available; any other failure is left to the caller to handle.
            print("Product not available")
            return [], []
        parts = data_rel[0].split("|")
        return [parts[0]], [parts[1]]

    # function for gatting shipping information
    def get_shipping(self, hxs):
        """Return the text nodes of the shipping information block as a list."""
        shipping_nodes = hxs.select('//div[@id="targeter_pdpShipping"]/span/text()')
        return shipping_nodes.extract()

    # function for getting colors, return jsons with all the data about options
    def get_colors(self, hxs):
        """Return the ``styleInfo`` JSON blobs describing the colour options.

        Every blob is parsed and re-serialized with ``simplejson`` and then
        wrapped with ``basic.cdata``.
        """
        blobs = hxs.select('//var[@class="styleInfo"]/text()').extract()
        return [basic.cdata(simplejson.dumps(simplejson.loads(blob))) for blob in blobs]

    # cleaning price to leave only numbers
    def clean_price(self, price):
        """Return *price* (a list of strings) with everything but digits and dots removed."""
        return [re.sub("[^0-9.]", "", raw) for raw in price]

    def spider_closed(self, spider):
        """Handle the spider_closed signal at the end of scraping.

        Updates the database (when run with the database option), writes the
        XML file, optionally exports it, mails the report and, for database
        runs, stores the report under ``logs/<spider name>/``.

        If the database step fails before the catalog name was read, the
        spider name is used as the file name so the XML and the report are
        still written.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d["database"]:
            # Fallback in case the database cannot provide the catalog name.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail

        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d["email"])
        except Exception:
            # Best effort: the failure is only recorded in the stored report.
            msg += "\nSending mail failed."
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def add_properties(self, xml):
        """Register the product properties of this spider on *xml*.

        Each property is registered through ``xml.add_property`` with its
        key, display label and type name, in the order listed below.
        """
        properties = (
            ("old_price", "Old Price", "decimal"),
            ("image_json", "Image Json", "text_list"),
            ("discount", "Discount", "decimal"),
            ("product_ref", "Product Ref.", "text"),
            ("in_stock", "In Stock", "text"),
            ("serial", "Serial", "text_list"),
            ("colors", "Colors", "text_list"),
            ("add_to_cart_id", "Add To Cart ID", "text"),
            ("shipping", "Shipping", "text"),
            ("warranty", "Warranty", "text_list"),
            ("heading", "Heading", "text"),
            ("details", "Details", "text"),
            ("specs", "Specs", "text"),
            ("call_to_action", "Call To Action", "text"),
            ("brand_image", "Brand Image", "text"),
            ("brand_image_promo", "Brand Image Promo", "text"),
        )
        for key, label, type_name in properties:
            xml.add_property(key, label, type_name)

    def get_lists_from_excel(self):
        """Load product URLs, ids and names from the Excel sheet.

        Fills ``self.products`` from the sheet, reports read failures with
        error code 103, then removes duplicates, splits off the products
        without a URL into ``self.no_urls`` and adds a status to both.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = {}
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = ("I/O error {0}: {1}".format(e.errno, e.strerror)
                   + "\nError occurred for given file: {0}".format(self.d["file"]))
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = ("Error reading excel file"
                   + "\nError occurred for given file: {0}".format(self.d["file"]))
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

Exemple #2

0

Afficher le fichier

class ExpressSpider(CrawlSpider):
    """Spider that scrapes product pages, including colour and size variants.

    The class-level ``start_urls`` is only a placeholder: ``__init__``
    replaces it with URLs taken from ``self.url_list``.
    """
    name = "express"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    # Collects "id in sheet differs from id on site" notes for the final report.
    temp_msg = ""
    # NOTE(review): presumably lets 404 responses reach parse(), which checks
    # for the 404 page itself - confirm against the Scrapy version in use.
    handle_httpstatus_list = [404]
    # Number of responses handled so far (incremented in parse()).
    counter = 0

    def __init__(self, *a, **kw):
        """Prepare the XML writer, shops and URL list before crawling.

        NOTE(review): only the first two entries of ``self.url_list`` are
        used as start URLs - this looks like a debugging limit; confirm
        before relying on full coverage.
        """
        super(ExpressSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        argument_parser = CommonTerminal(sys.argv, self.name)
        self.log = Logger()
        self.d = argument_parser.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        shop_builder = CreateShops(self.d['file'], self.xml)
        try:
            shop_builder.get()
        except IndexError:
            print("This sheet has no shop look or line")
        self.get_lists_from_excel()
        self.add_properties(self.xml)
        first_urls = self.url_list[:2]
        self.start_urls = first_urls
        self.total = len(first_urls)

    def parse(self, response):
        """Scrape one product page and write the product and its variants.

        Outcomes, as reported through ``self.exc.code_handler``:

        * placeholder URL ``http://www.zmags.com/`` - code 101 and a
          "not available" stub entry in the XML;
        * 404 page - code 104;
        * product marked as no longer available - code 102, the basic entry
          is written with ``NOT_IN_STOCK``;
        * any StandardError while scraping - code 100.

        A product counts as available when the page shows no error text or
        the error text is not the "no longer available" message.

        Returns the main product item.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = ExpressItem()
        index = self.url_list.index(response.url)
        # Overwrite the handled URL with the running counter.
        # NOTE(review): presumably so a duplicate URL further down the list
        # resolves to its own row on a later response - confirm.
        self.url_list[index] = self.counter
        # Stays 0 only for the placeholder URL; triggers the stub entry below.
        flag = 0
        # main try that catches all unhandled errors
        try:
            if response.url != "http://www.zmags.com/":
                error_404 = hxs.select('//img[@alt="404 Error Page Not Found"]').extract()
                flag = 1
                if not error_404:
                    available = hxs.select('//span[@class="glo-tex-error"]/text()').extract()
                    page = " ".join(hxs.select('//html').extract())
                    # part for creating main product in xml
                    product_id = self.get_product_id(hxs)[0]
                    if product_id != self.id_list[index]:
                        msg = "\nNot equal, id in sheet {0}, on site {1}".format(self.id_list[index], product_id)
                        self.temp_msg += msg
                    item['product_id'] = [product_id]
                    item['name'] = self.get_name(hxs)
                    item['description'], item['promo_text'] = self.get_basic_info(hxs)
                    item['master_price'], item['discount_price'] = self.get_product_prices(hxs)
                    item['shop_look'] = ['False']
                    item['normal'] = ['True']
                    item['shop_line'] = ['False']
                    item['in_stock'] = ["NOT_IN_STOCK"]
                    # An empty list means the page shows no error text at
                    # all, which must not be treated as a scraping error.
                    if not available or available[0] != "This item is no longer available for purchase.":
                        item['category_id'], item['subcategory_id'] = self.get_categories(hxs)
                        item['add_to_cart_id'] = self.get_add_to_cart_id(hxs)
                        color_names, urls, swatch_image_names, jsons = self.get_swatch_images(hxs)
                        item['color_image_url'] = self.create_color_json(urls, color_names)
                        item['in_stock'] = ["IN_STOCK"]
                        item['product_page'] = [response.url]
                        self.xml.create_xml(item)
                        product_images, images_grouped = self.parse_jsons(jsons, color_names)
                        ids, sizes, prices = self.get_variants(page)
                        # calling function that will handle creating all child products
                        self.create_child_products(product_id, ids, sizes, prices, images_grouped)
                        item['image_urls'] = urls + product_images
                        if self.shop_look_list[index]:
                            self.parse_for_shop_look(hxs, self.shop_look_list[index],
                                                     product_id, page, images_grouped, response.url, index)
                        if self.shop_line_list[index]:
                            self.parse_for_shop_look(hxs, self.shop_line_list[index],
                                                     product_id, page, images_grouped, response.url, index)
                    else:
                        self.xml.create_xml(item)
                        self.exc.code_handler(102, response.url)
                else:
                    self.exc.code_handler(104, response.url)
            else:
                basic.not_provided()
                self.exc.code_handler(101, response.url)
            if not flag:
                item['product_id'] = [self.id_list[index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                item['name'] = ["not available"]
                self.xml.create_xml(item)
        except StandardError:
            self.exc.code_handler(100, response.url)
        return item

    def parse_for_shop_look(self, hxs, id, product_id, page, images_grouped, product_url, index):
        """Write the shop look / shop line entry of a product to the XML.

        The entry carries the same product data as the normal entry, but the
        colours and the variants are stored as JSON and the shop look/line
        id (*id*) is used as master product id.
        TO DO: see if there is need to specially handle the case
        for not available
        """
        item = ExpressItem()
        item['master_product_id'] = [id]
        item['product_id'] = [id + "_" + product_id]
        if self.ordered:
            item['order_index'] = [self.order_list[index]]
        item['style'] = [product_id]
        item['product_page'] = [product_url]
        item['category_id'], item['subcategory_id'] = self.get_categories(hxs)
        item['add_to_cart_id'] = self.get_add_to_cart_id(hxs)
        # swatch images and images json, one entry per colour
        color_names, urls, _swatch_names, _json_urls = self.get_swatch_images(hxs)
        colors = []
        for position, color_name in enumerate(color_names):
            color_entry = {'name': color_name, 'swatch_url': urls[position],
                           'image_url': self.get_absolute_url(images_grouped[color_name])}
            colors.append(simplejson.dumps(color_entry))
        item['colors'] = basic.cdata_field(colors)
        item['price'], item['discount_price'] = self.get_product_prices(hxs)
        item['description'], item['promo_text'] = self.get_basic_info(hxs)
        item['name'] = self.get_name(hxs)
        # variants json, one entry per colour
        ids, sizes, prices = self.get_variants(page)
        variants = []
        for color in ids:
            variant = {'color': color, 'prices': prices[color], 'ids': ids[color]}
            try:
                variant['sizes'] = sizes[color]
            except StandardError:
                print("This product has no sizes")
            variants.append(simplejson.dumps(variant))
        item['variants'] = basic.cdata_field(variants)
        self.xml.create_xml(item)

    def parse_shop_look(self, hxs):
        """Write a shop look page and its products to the XML.

        First a master entry for the look itself is written (with the
        placeholder id ``DUMMIE1``), then one child entry per product block
        on the page, numbered from 1.
        """
        products = hxs.select('//div[@id="cat-ens-prod-item"]')
        # do this with actual id
        item = ExpressItem()
        whole_page = "".join(hxs.extract())
        ensemble_id = basic.get_middle_text(whole_page, "ensembleId: '", "',")
        name = hxs.select('//div[@id="cat-ens-prod-con"]/h1/text()').extract()
        name = basic.clean_string_field(name)
        item['ensemble_id'] = ensemble_id
        item['normal_image_url'] = self.shl_get_image(hxs)
        item['product_id'] = ["DUMMIE1"]
        item['shop_look'] = ['True']
        item['normal'] = ['False']
        item['shop_line'] = ['False']
        item['in_stock'] = ['IN_STOCK']
        item['name'] = name
        # The XML writer lives on the spider instance, as everywhere else in
        # this class.
        self.xml.create_xml(item)
        item.clear()
        for number, product in enumerate(products, 1):
            item = ExpressItem()
            item['master_product_id'] = ['DUMMIE1']
            item['product_id'] = ["DUMMIE1_" + str(number)]
            item['name'], item['price'], item['style'] = self.shl_basic_info(product)
            page = product.extract()
            item['variants'] = basic.cdata_field([self.shl_create_variants(self.get_variants(page))])
            item['colors'] = basic.cdata_field(self.shl_get_swatches(product))
            self.xml.create_xml(item)
        # return images for download here once it's needed

    def get_categories(self, hxs):
        """Return ``(category_id, sub_category_id)`` read from the hidden form inputs, each as a list."""
        category_values = hxs.select('//input[@name="categoryId"]/@value')
        sub_category_values = hxs.select('//input[@name="subCategoryId"]/@value')
        return category_values.extract(), sub_category_values.extract()

    def get_add_to_cart_id(self, hxs):
        """Return the values of the ``productId`` form inputs as a list."""
        product_id_values = hxs.select('//input[@name="productId"]/@value')
        return product_id_values.extract()

    def shl_get_image(self, hxs):
        """Return a one-element list with the image URL of a shop look.

        The URL is built from the first ``imagesets`` value found in the
        page source.
        """
        image_sets = basic.get_middle_text(hxs.extract(), 'imagesets = "', '";')
        url_template = "http://t.express.com/com/scene7/s7d5/=/is/image/expressfashion/%s/i81"
        return [url_template % (image_sets[0])]

    def shl_create_variants(self, f):
        """Create the variants JSON for a shop look product.

        *f* is the ``(ids, sizes, prices)`` triple as returned by
        ``get_variants``, each keyed by colour. Returns a JSON list with one
        object per colour holding ``color``, ``ids``, ``prices`` and, when
        available, ``sizes``.
        """
        variants = []
        for color in f[0]:
            variant = {'color': color, 'ids': f[0][color]}
            try:
                variant['sizes'] = f[1][color]
            except StandardError:
                print("This product has no sizes")
            variant['prices'] = f[2][color]
            variants.append(variant)
        return simplejson.dumps(variants)

    def shl_get_swatches(self, hxs):
        """Return the swatches of a shop look product as JSON strings.

        Every swatch text is a comma separated triple; its parts are stored
        under ``name``, ``swatch_url`` and ``image_url``.
        """
        swatch_display = hxs.select('div[@class="cat-ens-prod-info"]/div[@class="cat-ens-prod-swatch-display"]')
        swatch_texts = swatch_display.select('span/text()').extract()
        encoded = []
        for swatch_text in swatch_texts:
            fields = swatch_text.split(",")
            swatch = {'name': fields[0], 'swatch_url': fields[1], 'image_url': fields[2]}
            encoded.append(simplejson.dumps(swatch))
        return encoded

    def shl_basic_info(self, hxs):
        """Return ``(name, price, style)`` of a shop look product, each as a list.

        ``style`` is the cleaned second text node of the info block when the
        block has more than two text nodes, otherwise an empty list.
        """
        name = basic.clean_string_field(
            hxs.select('div[@class="cat-ens-prod-info"]/h1/text()').extract())
        raw_price = hxs.select('div[@class="cat-ens-prod-info"]/span/text()').extract()
        price = basic.clean_spaces_field(basic.clean_string_field(raw_price))
        text_nodes = hxs.select('div[@class="cat-ens-prod-info"]/text()').extract()
        style = [basic.clean_string(text_nodes[1])] if len(text_nodes) > 2 else []
        return name, price, style

    def create_color_json(self, urls, names):
        """Return one JSON string per swatch URL, pairing it with the name at the same position."""
        encoded = []
        for position, url in enumerate(urls):
            color = {'url': url, 'name': names[position]}
            encoded.append(simplejson.dumps(color))
        return encoded

    def get_basic_info(self, hxs):
        """Return ``(description, promo_text)`` of the product.

        The description is a one-element list with the cleaned,
        ``basic.cdata``-wrapped description block. The promo text is taken
        from the promo span's text nodes, falling back to its ``font``
        children, and is an empty list when neither exists.
        """
        raw_description = hxs.select('//li[@class="cat-pro-desc"]').extract()[0]
        description = [basic.cdata(basic.clean_string(raw_description))]
        promo_text = (hxs.select('//span[@class="cat-pro-promo-text"]/text()').extract()
                      or hxs.select('//span[@class="cat-pro-promo-text"]/font').extract())
        if promo_text:
            promo_text = basic.cdata_field(promo_text)
        return description, promo_text

    def get_name(self, hxs):
        """Return the cleaned product name as a one-element list."""
        headings = hxs.select('//div[@id="cat-pro-con-detail"]/h1/text()').extract()
        return [basic.clean_string(headings[0])]

    def get_product_prices(self, hxs):
        """Return ``(price, discount_price)`` reduced to digits, dots and commas.

        Without a regular price on the page, the old price is used as price
        and the sale price as discount price. ``discount_price`` is an empty
        list when there is no discount.
        """
        price = hxs.select('//li[@class="cat-pro-price"]/strong/text()').extract()
        discount_price = []
        if not price:
            price = hxs.select('//li[@class="cat-pro-price"]/span[@class="cat-glo-tex-oldP"]/text()').extract()
            discount_price = hxs.select('//li[@class="cat-pro-price"]/span[@class="cat-glo-tex-saleP"]/text()').extract()
        if discount_price:
            discount_price = [re.sub('[^0-9.,]', '', discount_price[0])]
        cleaned_price = re.sub('[^0-9.,]', '', price[0])
        return [cleaned_price], discount_price

    def get_product_id(self, hxs):
        """Return the product sku from the page, without semicolons, as a one-element list."""
        style_ids = hxs.select('//input[@name="omnitureStyleID"]/@value').extract()
        return [style_ids[0].replace(";", "")]

    def get_swatch_images(self, hxs):
        """Return ``(color_names, urls, swatch_image_names, jsons)`` for the swatches.

        When the page has neither swatch images nor colour names, a single
        ``no_color`` entry is used together with the page's first image set.
        ``jsons`` holds the image set JSON URL of every swatch image name.
        """
        urls = hxs.select('//li[@id="widget-product-swatches"]/a/img/@src').extract()
        color_names = hxs.select('//li[@id="widget-product-swatches"]/a/img/@alt').extract()
        swatch_image_names = self.get_swatch_image_name(urls)
        if not (swatch_image_names or color_names):
            color_names.append("no_color")
            swatch_image_names = self.get_imagesets(hxs)
        return color_names, urls, swatch_image_names, self.get_json(swatch_image_names)

    def get_imagesets(self, hxs):
        """Return the first image set of the page as a one-element list.

        Used for products without colours: the image sets are read from the
        javascript on the page and only the first one is kept, because there
        is just one colour (no_color) to associate it with. The page length
        is printed along the way.
        """
        page = hxs.extract()
        print(len(page))
        image_sets = basic.get_middle_text(page, 'imagesets = "', '"; //Change')
        first_set = image_sets[0].split(',')[0]
        return [first_set]

    def get_swatch_image_name(self, image_sites):
        """Return the swatch image name found in each swatch image URL."""
        return [basic.get_middle_text(site, "fashion/", "_s")[0] for site in image_sites]

    def get_json(self, image_names):
        """Return the image set JSON URL for every swatch image name."""
        return [
            "http://s7d5.scene7.com/is/image/expressfashion/" + image_name + "?req=imageset,json"
            for image_name in image_names
        ]

    def parse_jsons(self, jsons, color_names):
        """Download every image set JSON and collect the image names.

        Returns ``(image_urls, images_grouped)``: the absolute URLs of all
        images, and the image names grouped by the colour name found at the
        same position as the JSON URL, so the groups can be used later when
        creating child products in the XML.
        """
        # Local import: the response object is not a context manager in
        # Python 2, closing() makes sure the connection is released.
        from contextlib import closing

        images = []
        images_grouped = {}
        for position, json_url in enumerate(jsons):
            with closing(urllib2.urlopen(json_url)) as response:
                payload = response.read()
            first_image = basic.get_middle_text(payload, '"expressfashion/', ";")
            rest_of_images = basic.get_middle_text(payload, ',expressfashion/', ";")
            color_images = first_image + rest_of_images
            images_grouped = basic.add_to_dict(images_grouped, color_names[position], color_images)
            images += color_images
        return self.get_absolute_url(images), images_grouped

    def get_absolute_url(self, images):
        """Return the absolute image URL (351 pixels wide) for every relative image path."""
        return [
            "http://s7d5.scene7.com/is/image/expressfashion/" + image + "?width=351"
            for image in images
        ]

    def get_variants(self, page):
        """Read the product variants from the javascript on the page.

        Returns the three dicts ``(ids, sizes, prices)``, each keyed by
        colour (``no_color`` for variants without one). ``sizes`` only gets
        entries for variants that define a size.
        """
        section = page.split("// Load the product variants")[1]
        section = section.split("// Set the field to update with the product variant")[0]
        # The text before the first marker is not a variant, hence [1:].
        variant_chunks = section.split("// Create the variant")[1:]
        ids, sizes, prices = {}, {}, {}
        for chunk in variant_chunks:
            colors_found = basic.get_middle_text(chunk, "Color','", "')")
            color = colors_found[0] if colors_found else "no_color"
            ids = basic.add_to_dict(ids, color, basic.get_middle_text(chunk, "setId('", "')")[0])
            if "Size','" in chunk:
                sizes = basic.add_to_dict(sizes, color, basic.get_middle_text(chunk, "Size','", "')")[0])
            prices = basic.add_to_dict(prices, color, basic.get_middle_text(chunk, 'numericPrice="', '"')[0])
        return ids, sizes, prices

    def get_image_url(self, images, is_swatch=False):
        """Return the paths of *images* on our servers.

        For swatches (``is_swatch is True``) only the list of image paths is
        returned; otherwise the tuple ``(image_paths, thumb_paths)``.
        """
        image_paths = []
        thumb_paths = []
        for image in images:
            image_paths.append(normal_image_url + image + ".jpg")
            thumb_paths.append(thumb_image_url + image + ".jpg")
        if is_swatch is True:
            return image_paths
        return image_paths, thumb_paths

    def create_child_products(self, main_id, ids, sizes, prices, images_grouped):
        """Create the child products (colours and their sizes) in the XML.

        Arguments: *main_id* is the product id of the master product,
        *images_grouped* is a dict of image lists keyed by colour, and *ids*,
        *sizes* and *prices* are dicts keyed by colour whose values are lists
        (e.g. ``'black': ['32854', '32855']``).

        Every colour becomes a child of the master product with the id
        ``<main_id>_<letter>``; every id listed for that colour becomes a
        child of the colour product with the id ``<colour id>_<letter>``.
        """
        for color_index, color in enumerate(ids):
            cur_id = main_id + "_" + chr(color_index + 97)
            # A fresh item per entry, so the size/price fields of the
            # previous colour's last variant cannot end up in this one.
            item = ExpressItem()
            item['product_id'] = [cur_id]
            item['master_product_id'] = [main_id]
            item['color'] = [color]
            if images_grouped:
                images = self.get_absolute_url(images_grouped[color])
                item['normal_image_url'] = basic.cdata_field(self.map_url_to_server(images, main_id))
            self.xml.create_xml(item)
            for size_index, size_option_id in enumerate(ids[color]):
                item = ExpressItem()
                item['product_id'] = [cur_id + "_" + chr(size_index + 97)]
                item['master_product_id'] = [cur_id]
                if len(sizes):
                    item['size'] = [sizes[color][size_index]]
                item['size_option_id'] = [size_option_id]
                item['price'] = [prices[color][size_index]]
                self.xml.create_xml(item)

    def map_url_to_server(self, urls, main_id, is_swatch=False):
        """Return the image urls to store for a product.

        The urls of the scraped site are kept as they are, so ``urls`` is
        returned unchanged; ``main_id`` and ``is_swatch`` are accepted only to
        keep the signature stable for callers.

        Mapping to paths on our own server is disabled.  The unreachable
        statements that implemented it were removed; they built
        ``image_path + "/" + main_id + "/full/" + sha1(url) + ".jpg"`` for
        every url, plus a matching "/thumb/" list when ``is_swatch`` was
        False.
        """
        return urls

    def spider_closed(self, spider):
        """Finish the run when scrapy sends the spider_closed signal.

        Writes the xml, exports it to the database when the 'upload'
        argument is set and mails a report that describes the run.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # self.d['file'] is the filename used for writing the xml
        self.xml.write_xml(self.name, self.d['file'])
        msg += self.exc.create_message(self.counter)
        msg += "\n{0}".format(self.temp_msg)
        exporter = CommonExport()
        # export to the database only when it was asked for
        if not self.d['upload']:
            msg += "\n\nUpload to database not selected"
        else:
            try:
                exporter.xml_to_db(self.name, self.d['file'], "e2b3b658-16d5-4059-a9df-3c212c817d2c")
            except StandardError:
                msg += "\n\nExport to database failed"
            else:
                msg += "\n\nExport to database successful"
        msg += self.log.get_message()
        from modules.mail import Mail
        Mail().send_mail(msg, "Express scraper report")

    def get_lists_from_excel(self):
        """Read the product lists from the excel sheet given by self.d['file'].

        Fills url_list, id_list, shop_look_list, shop_line_list and, when it
        can be read, order_list.  ``self.ordered`` is set to False and a log
        message is added when reading the order column fails.  A failure
        while reading any other column is reported through
        ``self.exc.code_handler`` with code 103.
        """
        def report(reason):
            # every read failure carries the same code and the same file note
            reason += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=reason)

        xls = CommonExcel(basic.get_excel_path(self.name, self.d['file']))
        self.ordered = True
        try:
            self.url_list = xls.read_excel_collumn_for_urls(4, 1)
            self.id_list = xls.read_excel_collumn_for_ids(0, 1)
            self.shop_look_list = xls.read_excel_collumn(1, 1)
            self.shop_line_list = xls.read_excel_collumn(2, 1)
            try:
                self.order_list = xls.read_excel_collumn_for_ids(6, 1)
            except Exception:
                # A sheet without an order column is still usable.  Catching
                # Exception instead of using a bare except lets
                # KeyboardInterrupt and SystemExit propagate.
                self.ordered = False
                self.log.add_message("No order provided in this sheet.")
        except IOError as e:
            report("I/O error {0}: {1}".format(e.errno, e.strerror))
        except StandardError:
            report("Error reading excel file")

    def add_properties(self, xml):
        """Register the custom properties of this scraper on ``xml``.

        Every tuple below holds the three positional arguments of one
        ``xml.add_property`` call; the calls are made in the listed order.
        """
        custom_properties = (
            ("size_option_id", "Size Option Id", "text"),
            ("color_image_url", "Color Image Url", "text_list"),
            ("colors", "Colors", "text_list"),
            ("variants", "Variants", "text_list"),
            ("style", "Style", "text"),
            ("mode", "Mode", "text"),
            ("shop_look", "Shop look", "boolean"),
            ("shop_line", "Shop line", "boolean"),
            ("normal", "Normal", "boolean"),
            ("ensemble_id", "Ensemble ID", "text"),
            ("promo_text", "Promo text", "text"),
            ("in_stock", "In Stock", "text"),
            ("product_page", "Product page", "text"),
            ("master_price", "Master Price", "decimal"),
            ("subcategory_id", "Sub Category ID", "text"),
            ("add_to_cart_id", "Add to cart ID", "text"),
            ("order_index", "Order Index", "integer"),
        )
        for first, second, third in custom_properties:
            xml.add_property(first, second, third)

Exemple #3

0

Afficher le fichier

Fichier : sportman_spider.py Projet : marjevtic/testMarko

class SportmanSpider(CrawlSpider):
    name = "sportman"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Create the helper objects and load the list of products to scrape.

        The products come from the database when the 'database' argument is
        set and from the excel sheet otherwise.
        """
        super(SportmanSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")

        if not self.d["database"]:
            self.get_lists_from_excel()
        else:
            self.database = Database()
            self.database.connect()
            catalog_id, product_id = self.d["catalog_id"], self.d["product_id"]
            self.products, self.no_urls = self.database.select_products(catalog_id, product_id)
            self.database.disconnect()
        self.add_properties(self.xml)
        # the urls of the loaded products are the pages scrapy starts from
        self.start_urls = self.products["urls"]
        self.images_store = "/" + settings["IMAGES_STORE"]
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page and write the product to the xml.

        A redirected request is written as NOT_AVAILABLE with the id and name
        taken from the product list and reported with code 102.  Otherwise
        the product data, the viewstate (cut into six chunks), the event
        validation value, the variants and the image paths are written, after
        which the item is emptied and only carries 'image_urls'.

        Returns the item, or None when scraping raised an exception; in that
        case the url is reported with code 100 and the product status is set
        to "error".
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        redirected = "redirect_urls" in response.request.meta
        if redirected:
            # look the product up by the url that was originally requested
            cur_url = response.request.meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            if redirected:
                item["product_id"] = [self.products["product_ids"][index]]
                item["name"] = [self.products["names"][index]]
                item["in_stock"] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                (
                    item["name"],
                    item["short_desc"],
                    item["description"],
                    item["old_price"],
                    item["custom_price"],
                    item["product_id"],
                    item["sku"],
                ) = self.get_basic_info(hxs)
                item["in_stock"] = ["IN_STOCK"]
                # only the first two of the values returned by get_vars are
                # used here
                viewstate, eventval = self.get_vars(response, hxs)[:2]

                # The viewstate is stored in chunks of 2000 characters; the
                # sixth chunk takes everything from position 10000 on.
                chunk_bounds = [
                    (0, 2000),
                    (2000, 4000),
                    (4000, 6000),
                    (6000, 8000),
                    (8000, 10000),
                    (10000, None),
                ]
                for number, (start, end) in enumerate(chunk_bounds, 1):
                    item["viewstate%d" % number] = [basic.cdata(viewstate[start:end])]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)

                images_url = self.get_images(hxs)

                item["normal_image_url"] = self.get_server_path(images_url)

                self.xml.create_xml(item)
                item.clear()
                item["image_urls"] = self.get_images(hxs)
                self.products["status"][index] = "ran"
        except Exception:
            # Exception instead of a bare except: KeyboardInterrupt and
            # SystemExit must not be recorded as a scraping error.
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def get_basic_info(self, hxs):
        """Extract the basic product data from the page.

        Returns the tuple (name, short_desc, description, old_price, price,
        product id, sku).  ``old_price`` stays an empty list when the page
        has no old price; ``price`` is read from the "nowprice" span and
        from the "normalprice" span when that one is missing.
        """
        def amount(texts):
            # take the second ':'-separated field and drop "Kr" and spaces
            field = " ".join(texts).split(":")[1]
            return [field.replace("Kr", "").replace(" ", "")]

        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
        short_desc = hxs.select('//div[@class="description2"]/text()').extract()

        raw_description = hxs.select('//div[@id="fragment-1"]/div[@class="description"]').extract()
        description = [basic.cdata(sportman.delete_tags(re, raw_description[0]))]

        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price != []:
            old_price = amount(old_price)

        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if price == []:
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
        price = amount(price)

        article = " ".join(hxs.select('//div[@class="articlenumber"]').extract())
        sku = basic.get_middle_text(article.replace(u"\xa0", ""), "Art.nr.", "</div>")
        product_id = [sku[0]]

        return name, short_desc, description, old_price, price, product_id, sku

    def get_vars(self, response, hxs):
        """Collect the hidden form values of a product page.

        The values are read twice: from the page scrapy downloaded (``hxs``)
        and from a copy of ``response.url`` fetched again with ``requests``
        and fixed headers.  The second copy is also searched for a script
        url; the response of that url supplies the script manager hidden
        field value.

        Returns a tuple of eight values: viewstate, event validation,
        previous page and hidden field of the scrapy page (the last two are
        always ""), followed by the same four values of the second copy.
        """
        headers1 = {
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1",
            "Host": "www.sportmann.no",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us,en;q=0.5",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Connection": "keep-alive",
            "Referer": "/product.aspx?productid=613232",
            "Cookie": "ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
        }

        page = " ".join(hxs.select("//html").extract())

        viewst = basic.get_middle_text(page, 'id="__VIEWSTATE" value="', '"')
        eventval = basic.get_middle_text(page, 'id="__EVENTVALIDATION" value="', '"')

        r = requests.get(response.url, headers=headers1)
        page_one = r.content

        viewst_page = basic.get_middle_text(page_one, 'id="__VIEWSTATE" value="', '"')
        eventval_page = basic.get_middle_text(page_one, 'id="__EVENTVALIDATION" value="', '"')
        prevpage_page = basic.get_middle_text(page_one, 'id="__PREVIOUSPAGE" value="', '"')

        # The script url is taken from the last '<script sr' fragment found
        # between the __VIEWSTATE and the __PREVIOUSPAGE fields.
        between_fields = page_one.split('id="__VIEWSTATE"')[1]
        between_fields = between_fields.split('id="__PREVIOUSPAGE"')[0]
        last_script = between_fields.split("<script sr")[-1]
        script_path = basic.get_middle_text(last_script, 'c="', '"')[0]
        hidden_url = "http://www.sportmann.no" + script_path.replace("amp;", "")

        response_hidden = urllib2.urlopen(urllib2.Request(hidden_url))
        try:
            hidden_body = response_hidden.read()
        finally:
            # release the connection even when reading fails
            response_hidden.close()
        hidden_field_page = basic.get_middle_text(
            hidden_body, "ctl00_ScriptManager1_HiddenField').value += '", "';"
        )

        return (
            viewst[0],
            eventval[0],
            "",
            "",
            viewst_page[0],
            eventval_page[0],
            prevpage_page[0],
            hidden_field_page[0],
        )

    def get_variants(self, hxs, response):
        """Build the color and size options of a product.

        Colors are read from the '<div class="color">' block and sizes from
        the '<div class="size">' block of the page.  Each block is either
        plain text with a hidden input holding the value, or contains a
        '<select name' element whose options are parsed.  When the size block
        is plain text and empty, the sizes are fetched per color value with
        get_data.

        Returns a list of ``basic.cdata`` wrapped json strings; each json
        object has the keys "color", "color_value", "size" and "size_value".
        """
        page = hxs.select("//html").extract()
        page = " ".join(page)
        dict_one = {}
        test_one = []

        # temp has a single part when the color block holds no "<select name"
        temp = page.split('<div class="color">')
        temp = temp[1].split("</div>")
        temp = temp[0].split("<select name")

        # only the last four values (those of the re-fetched page) are used,
        # as arguments of get_data below
        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs
        )

        if len(temp) == 1:
            # single color: text of the div plus the Variant1Hidden input
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value').extract()
            color[0] = color[0].replace("  ", "")
            color = basic.clean_string(color[0])
            value = value[0]

        #            color = basic.clean_string(color[0])
        #            color = color.replace("  ","")
        #
        #            dict['color'] = color
        #            dict['color_value'] = value[0]

        else:
            # several colors: names and values of the select options
            test_color = basic.get_middle_text(temp[1], "farge</option>", "</select>")
            color = basic.get_middle_text(test_color[0], '">', "</option>")
            value = basic.get_middle_text(test_color[0], 'value="', '">')

            for i in range(0, len(color)):
                color[i] = color[i].replace("  ", "")
            #
            #                dict['color'] = color
            #                dict['color_value'] = value

        # size_temp has a single part when the size block holds no
        # "<select name"
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split("</div>")
        size_temp = size_temp[0].split("<select name")

        if len(size_temp) == 1:
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace("   ", "")]

            size_val = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value').extract()

            if size[0] == "":
                # no size on the page itself: ask for the sizes of every
                # color value and append one json entry per color
                # NOTE(review): in the single color branch above ``value`` and
                # ``color`` are not lists, so this loop would index them
                # element by element of that value -- confirm this
                # combination cannot occur.
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i])

                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split("</select>")

                    if len(a_page) == 1:

                        size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"')
                        size_val = basic.get_middle_text(a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]

                    else:
                        a_page = basic.get_middle_text(a_page[0], "se</option>", "</select>")
                        size = basic.get_middle_text(a_page[0], '">', "</option>")
                        size_val = basic.get_middle_text(a_page[0], 'value="', '">')

                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val

                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace("   ", "")

                        dict_one["size"] = size

                    # dict_one is reused for every color; it is serialized
                    # before the next iteration overwrites it
                    test_one.append(basic.cdata(json.dumps(dict_one)))

            else:
                # a single size is given on the page: one entry for all colors
                dict_one["color"] = color

                dict_one["color_value"] = value
                dict_one["size"] = size
                dict_one["size_value"] = size_val
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))

        else:
            # sizes come from the select options: one entry for all colors
            test_size = basic.get_middle_text(size_temp[1], "se</option>", "</select>")
            size = basic.get_middle_text(test_size[0], '">', "</option>")
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')

            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace("   ", "")

            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one["size"] = size
            dict_one["size_value"] = size_val

            test_one.append(basic.cdata(json.dumps(dict_one)))

        return test_one

    def get_server_path(self, url):
        """Return the image store paths for a list of image urls.

        Every entry of ``url`` is replaced in place by its cleaned form and
        mapped to "<images_store>/full/<sha1 of the cleaned url>.jpg".
        """
        images_array = []
        for position, raw_url in enumerate(url):
            cleaned = basic.clean_string(raw_url)
            # the caller's list keeps the cleaned url as well
            url[position] = cleaned
            digest = hashlib.sha1(cleaned).hexdigest()
            images_array.append(self.images_store + "/full/" + digest + ".jpg")

        return images_array

    def get_images(self, hxs):
        """Return the absolute urls of the gallery images of a product.

        The 'src' values found between 'class="gallery_demo_unstyled"' and
        '<div class="right_container">' are prefixed with the site address.
        """
        page = " ".join(hxs.select("//html").extract())

        gallery = page.split('class="gallery_demo_unstyled"')[1]
        gallery = gallery.split('<div class="right_container">')[0]
        sources = basic.get_middle_text(gallery, 'src="', '"')

        return ["http://www.sportmann.no" + source for source in sources]

    def get_data(self, response, hidden, viewstate, previouspage, eventvalidation, colorvalue):
        """Request the product page with form data for one color value.

        The form data is built from fixed fragments, the url-encoded hidden
        field, viewstate, previous page and event validation values, and
        ``colorvalue`` as the value of the variant drop-down.  It is sent to
        ``response.url`` with fixed headers.

        Returns the body of the reply.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Host": "www.sportmann.no",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us,en;q=0.5",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Connection": "keep-alive",
            "Referer": "http://www.sportmann.no/product.aspx?productid=613232",
            "Cookie": "",
        }

        encoded_validation = urllib.urlencode({"__EVENTVALIDATION": eventvalidation})
        encoded_viewstate = urllib.urlencode({"__VIEWSTATE": viewstate})
        encoded_previous = urllib.urlencode({"__PREVIOUSPAGE": previouspage})
        encoded_hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden})

        # fixed fragments of the form data, in the order they are sent
        script_manager_part = "ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel%7Cctl00%24ContentPlaceHolder1%24ddlVariant&"
        toolkit_part = "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant&__EVENTARGUMENT=&__LASTFOCUS=&"
        search_part = "&ctl00%24ProductSearch%24txtProdSearch=&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState=&ctl00%24ContentPlaceHolder1%24ddlVariant="
        trailing_part = "&ctl00%24ContentPlaceHolder1%24Variant1Hidden=&ctl00%24ContentPlaceHolder1%24Variant2Hidden=&ctl00%24ContentPlaceHolder1%24tbAmount=1&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState=&__ASYNCPOST=true&"

        data = (
            script_manager_part
            + encoded_hidden
            + toolkit_part
            + encoded_viewstate
            + "&"
            + encoded_previous
            + "&"
            + encoded_validation
            + search_part
            + colorvalue
            + trailing_part
        )

        req = urllib2.Request(response.url, data, headers)
        return urllib2.urlopen(req).read()

    def spider_closed(self, spider):
        """Handle the spider_closed signal sent at the end of scraping.

        Updates the database (when run with the 'database' argument), writes
        the xml, optionally exports it to the database, mails the report and,
        for database runs, stores the report under logs/<spider name>/.

        When the database step fails before the catalog name is known, the
        spider name is used as the file name so that the xml and the report
        can still be written.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d["database"]:
            # Fallback for a failure before get_name returns; without it the
            # write_xml call below would raise a NameError.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail

        mail = Mail()
        try:
            mail.send_mail(msg, "Sportmann: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "Sportmann: {0}".format(filename), self.d["email"])
        except Exception:
            # a mail failure must not prevent the report from being stored
            msg += "\nSending mail failed."
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load the products to scrape from the excel sheet.

        Reads the 'urls', 'product_ids' and 'names' columns into
        self.products, reporting a read failure with code 103.  Afterwards
        duplicates are removed, the products without urls are moved to
        self.no_urls and both dicts are passed through _add_none_status.
        """
        def file_note():
            # second line of every error message, naming the file in use
            return "\nError occurred for given file: {0}".format(self.d["file"])

        xls = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = {}
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            io_detail = "I/O error {0}: {1}".format(e.errno, e.strerror)
            self.exc.code_handler(103, msg=io_detail + file_note())
        except StandardError:
            self.exc.code_handler(103, msg="Error reading excel file" + file_note())
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom properties of this scraper on ``xml``.

        Every tuple below holds the three positional arguments of one
        ``xml.add_property`` call; the calls are made in the listed order.
        """
        custom_properties = (
            ("short_desc", "Short Description", "text"),
            ("old_price", "Old Price", "text"),
            ("custom_price", "New Price", "text"),
            ("color_value", "Color Value", "text"),
            ("in_stock", "In Stock", "text"),
            ("size_val", "Size Value", "text_list"),
            ("sku", "Sku", "text"),
            ("size_options", "Size_options", "text_list"),
            ("viewstate1", "Viewstate1", "text_list"),
            ("viewstate2", "Viewstate2", "text_list"),
            ("viewstate3", "Viewstate3", "text_list"),
            ("viewstate4", "Viewstate4", "text_list"),
            ("viewstate5", "Viewstate5", "text_list"),
            ("viewstate6", "Viewstate6", "text_list"),
            ("eventval", "Eventval", "text_list"),
            ("hidden", "Hidden Field", "text_list"),
            ("prevpage", "Previous Page", "text_list"),
            ("recommended_product", "Recommended Product", "text_list"),
        )
        for first, second, third in custom_properties:
            xml.add_property(first, second, third)

Exemple #4

0

Afficher le fichier

class GuitarCenterSpider(CrawlSpider):
    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Create the helper objects and load the list of products to scrape.

        The products come from the database when the 'database' argument is
        set and from the excel sheet otherwise.  Products without a url are
        written to the xml right away through handle_not_provided.
        """
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if not self.d['database']:
            self.get_lists_from_excel()
        else:
            self.database = Database()
            self.database.connect()
            catalog_id, product_id = self.d['catalog_id'], self.d['product_id']
            self.products, self.no_urls = self.database.select_products(catalog_id, product_id)
            self.database.disconnect()
        self.add_properties(self.xml)
        self.handle_not_provided()
        # the urls of the loaded products are the pages scrapy starts from
        product_urls = self.products['urls']
        self.start_urls = product_urls
        self.total = len(product_urls)

    def parse(self, response):
        """Scrape one product page and write the product to the xml.

        The product is looked up in self.products by the requested url (the
        first entry of 'redirect_urls' when the request was redirected).  On
        success the item is written to the xml and 'image_urls' is added to
        it afterwards; on a StandardError the product status is set to
        "error" and the url is reported with code 100.

        Returns the item in both cases; after an error it holds only the
        fields that were filled before the exception.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()
        # The unused function-scope "from scrapy.conf import settings" was
        # dropped; nothing in this method reads the settings.
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'], item['brand'] = self.get_basic_info(hxs)
            item['heading'], item['details'], item['specs'], item['call_to_action'] = self.get_description(hxs)
            item['brand_image'], item['brand_image_promo'], brand_images = self.get_description_images(hxs)
            item['old_price'], item['discount'], item['price'] = self.get_prices(hxs)
            item['image_json'], img = self.get_images(hxs)
            item['serial'] = self.get_serials(hxs)
            item['warranty'] = self.gold_coverage(hxs)
            item['in_stock'] = self.get_available(hxs)
            item['product_ref'], item['add_to_cart_id'] = self.get_add_to_cart(hxs)
            if not item['add_to_cart_id']:
                # without an add to cart id the product is treated as
                # unavailable
                item['in_stock'] = ["NOT_AVAILABLE"]
            item['shipping'] = self.get_shipping(hxs)
            item['colors'] = self.get_colors(hxs)
            self.products['status'][index] = "ran"
        except StandardError:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            # added after create_xml, so it is not part of the written xml
            item['image_urls'] = img + brand_images
        return item

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE xml entry for every product in self.no_urls.

        The id and the name at the same position of 'product_ids' and 'names'
        belong together.  Positions come from enumerate instead of
        ``list.index``, which scanned the list for every product and always
        returned the first position of an id that occurs more than once.
        """
        item = GuitarCenterItem()
        for index, product_id in enumerate(self.no_urls['product_ids']):
            item['product_id'] = [product_id]
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return the product name and brand as a tuple of two lists.

        The name is the joined text of the 'fn' heading, cleaned with
        basic.clean_string and stripped of non-breaking spaces.
        """
        title = "".join(hxs.select('//h1[@class="fn"]/text()').extract())
        title = basic.clean_string(title)
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        return [title.replace(u"\xa0", "")], brand

    def get_description_images(self, hxs):
        """Return the brand image, the brand promo image and their raw urls.

        The first two values are lists holding the get_server_path result of
        the first url found (or an empty list when none was found); the
        third value is the list of all urls as found on the page.
        """
        logo_urls = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        promo_urls = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        found = logo_urls + promo_urls
        brand_image = [self.get_server_path(logo_urls[0])] if logo_urls else logo_urls
        brand_image_promo = [self.get_server_path(promo_urls[0])] if promo_urls else promo_urls
        return brand_image, brand_image_promo, found

    def get_description(self, hxs):
        """Return heading, details, specs and call to action of a product.

        Each value is the extracted markup or text passed through
        basic.cdata_field.
        """
        xpaths = (
            '//div[@id="description"]/p',
            '//p[@class="description"]',
            '//div[@class="specs"]/ul',
            '//div[@class="callToAction"]/p/text()',
        )
        # extract everything first, then wrap the parts in the same order
        extracted = [hxs.select(xpath).extract() for xpath in xpaths]
        heading, details, specs, last = [basic.cdata_field(part) for part in extracted]
        return heading, details, specs, last

    #function for getting prices, returns tags and values or empty field if no option for one of them new is discount
    def get_prices(self, hxs):
        """Return the old price, the discount and the price of a product.

        The old price is the first value of the 'lineItemList' when that
        list has more than one tag, the discount is its last value and the
        price is the text of the 'topAlignedPrice' span.  When all three are
        empty the price is read from the 'inlineList' instead.

        Returns three lists, each passed through clean_price; a list is
        empty when the page does not provide that value.
        """
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
        try:
            discount = [basic.clean_string(value[-1])]
        except IndexError:
            # single-argument print(...) behaves the same on Python 2 and 3
            print("This product has no price.")
        # extract() returns a list and nothing is indexed here, so this
        # statement needs no IndexError handler
        price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    # returning json with image url and serial number of product image refers to
    def get_images(self, hxs):
        """Return the image jsons and the raw image urls of a product.

        Every thumbnail of 'prodDetailThumbs' gives one cdata wrapped json
        with the keys 'image_url' (result of get_server_path) and
        'product_serial' (the class of the list entry without "site1sku").
        The second value is the list of urls as found on the page.
        """
        thumb_links = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        images_list = []
        img = []
        for position, link in enumerate(thumb_links):
            entry = {}
            entry['image_url'] = self.get_server_path(link)
            img.append(link)
            # replace() leaves a tag without "site1sku" untouched
            entry['product_serial'] = tags[position].replace("site1sku", "")
            images_list.append(basic.cdata(simplejson.dumps(entry)))
        return images_list, img

    # function for getting serials and all information about them, currently returns field with jsons with all
    # information, can be modified to return dicts if needed for subproducts for those one day
    def get_serials(self, hxs):
        """Return the serial information of a product as a list of jsons.

        Every 'hidden styleInfo' entry is parsed and serialized again with
        simplejson and wrapped with basic.cdata.
        """
        raw_serials = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        return [basic.cdata(simplejson.dumps(simplejson.loads(raw))) for raw in raw_serials]

    def get_server_path(self, url):
        """Return the image path to store for ``url``.

        The absolute image path of the scraped site is kept, so ``url`` is
        returned unchanged.  The unreachable alternative that followed the
        return was removed; it built
        ``IMAGES_STORE + "/full/" + sha1(url) + ".jpg"``.
        """
        return url

    # function for getting gold coverage from the page which is actually additional warranty options
    def gold_coverage(self, hxs):
        """Return the gold coverage options (additional warranty options).

        Every checkbox of the 'goldCoverage' block gives one cdata wrapped
        json with the keys 'id' (checkbox value) and 'name' (the label at
        the same position).
        """
        option_ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        options = []
        for position, option_id in enumerate(option_ids):
            entry = {}
            entry['id'] = option_id
            entry['name'] = labels[position]
            options.append(basic.cdata(simplejson.dumps(entry)))
        return options

    # function for getting availability
    def get_available(self, hxs):
        """Return the availability of a product as a list.

        Without an availability entry on the page (products with color
        options carry the status per color) the product counts as IN_STOCK.
        A first entry of "in_stock" is returned upper-cased; any other
        content is returned as extracted.
        """
        availability = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if not availability:
            return ["IN_STOCK"]
        if availability[0] == "in_stock":
            return [availability[0].upper()]
        return availability

    # function for getting add to cart id and product reference
    def get_add_to_cart(self, hxs):
        """Return the add-to-cart id and the product reference.

        Both values come from the ``data-rel`` attribute of the "addToList"
        span, which is split on "|": the first part is the add-to-cart id,
        the second the product reference.

        Returns a pair of one-element lists, or ``([], [])`` when the
        attribute cannot be read. Raises IndexError when the attribute
        contains no "|".
        """
        try:
            temp = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()[0]
        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not reported as a missing product.
            print("Product not available")
            return [], []
        parts = temp.split("|")
        return [parts[0]], [parts[1]]

    # function for getting shipping information
    def get_shipping(self, hxs):
        """Return the text of the spans inside the ``targeter_pdpShipping`` div."""
        shipping_spans = hxs.select('//div[@id="targeter_pdpShipping"]/span/text()')
        return shipping_spans.extract()

    # function for getting colors, return jsons with all the data about options
    def get_colors(self, hxs):
        """Collect the color option records embedded in the page.

        Every ``var`` element with class "styleInfo" holds a JSON document.
        Each one is parsed, serialized again and wrapped with ``basic.cdata``.

        Returns a list with one wrapped JSON string per element found.
        """
        raw_colors = hxs.select('//var[@class="styleInfo"]/text()').extract()
        return [basic.cdata(simplejson.dumps(simplejson.loads(raw))) for raw in raw_colors]

    # cleaning price to leave only numbers
    def clean_price(self, price):
        """Strip every character except digits and dots from each price.

        ``price`` is an iterable of strings; a new list of the cleaned
        strings is returned in the same order.
        """
        return [re.sub('[^0-9.]', '', raw) for raw in price]

    def spider_closed(self, spider):
        """Handle the spider_closed signal at the end of scraping.

        Builds the run report, updates the database (database runs only),
        writes the XML, optionally exports it, mails the report and, for
        database runs, stores the report under ``logs/<spider name>/``.

        ``spider`` is supplied by the signal and is not used.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            # Default to the spider name: without it a failure before
            # get_name() returns leaves `filename` unbound and the XML/report
            # steps below die with NameError instead of reporting the failure.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            # a second copy goes to the address given in the arguments, if any
            if self.d['email']:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d['email'])
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def add_properties(self, xml):
        """Register this spider's extra product properties on ``xml``.

        Each entry is passed to ``xml.add_property`` as
        (property key, display name, property type), in the order listed.
        """
        properties = (
            ("old_price", "Old Price", "decimal"),
            ("image_json", "Image Json", "text_list"),
            ("discount", "Discount", "decimal"),
            ("product_ref", "Product Ref.", "text"),
            ("in_stock", "In Stock", "text"),
            ("serial", "Serial", "text_list"),
            ("colors", "Colors", "text_list"),
            ("add_to_cart_id", "Add To Cart ID", "text"),
            ("shipping", "Shipping", "text"),
            ("warranty", "Warranty", "text_list"),
            ("heading", "Heading", "text"),
            ("details", "Details", "text"),
            ("specs", "Specs", "text"),
            ("call_to_action", "Call To Action", "text"),
            ("brand_image", "Brand Image", "text"),
            ("brand_image_promo", "Brand Image Promo", "text"),
        )
        for key, label, kind in properties:
            xml.add_property(key, label, kind)

    def get_lists_from_excel(self):
        """Load the product lists from the excel file given in the arguments.

        Fills ``self.products`` with the keys ``urls``, ``product_ids`` and
        ``names`` and, when reading succeeded, removes duplicates, moves the
        rows without a url into ``self.no_urls`` and adds the status field to
        both. Read errors are reported through ``self.exc.code_handler`` with
        code 103.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            # Post-process only fully read data; after a read error
            # self.products may hold just some of the three columns.
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)

Exemple #5

0

Afficher le fichier

Fichier : lydias_spider.py Projet : marjevtic/testMarko

class LydiasSpider(CrawlSpider):
    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Set up the spider: arguments, product lists, xml writer and urls.

        Products come from the database when the ``database`` argument is
        set, otherwise from the excel file. The product urls become the
        spider's start urls.
        """
        super(LydiasSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        if not self.d['database']:
            self.get_lists_from_excel()
        else:
            self.database = Database()
            self.database.connect()
            selection = self.database.select_products(self.d['catalog_id'], self.d['product_id'])
            self.products, self.no_urls = selection
            self.database.disconnect()
        # fix for bug with links they provide
        trimmed_urls = basic.cut_string_field(self.products['urls'], "&cat=")
        self.products['urls'] = trimmed_urls
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Scrape one product page, write it to the xml and return the item.

        The product is looked up in ``self.products`` by url; for a
        redirected response the first entry of ``redirect_urls`` is used.
        A page containing text in the ``searchfor`` div is treated as an
        unavailable product (code 102, status "not_avail"). Any error while
        scraping marks the product with status "error" and reports code 100.

        Raises ValueError when the url is not in ``self.products['urls']``.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        product_id = self.products['product_ids'][index]
        try:
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [product_id]
                item['name'], item['price'], item['old_price'], item['description'] = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = self.get_image_swatches(hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(default_images)
                # the master product is written before its subproducts
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                self.create_subproducts(product_id, color_names, product_image, color_codes, hxs)
                # set after the xml is written, so it only ends up on the returned item
                item['image_urls'] = product_image + color_urls + chart + default_images
                self.products['status'][index] = "ran"
            else:
                self.exc.code_handler(102, response.url)
                item['product_id'] = [product_id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not recorded as a product error.
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    # function for checking if product has embroidery or not
    def get_embroidery(self, hxs):
        """Return ``["True"]`` or ``["False"]`` for the embroidery flag.

        The flag is "True" when the page source contains the script statement
        that disables the ``logocolor`` element.
        """
        marker = "document.getElementById('logocolor').disabled = true;"
        page_source = hxs.select('//html').extract()[0]
        return ["True"] if marker in page_source else ["False"]

    # function for creating json with all information for colors
    def make_colors_json(self, color_urls, color_names, color_codes):
        """Build one JSON record per color swatch.

        The three lists are matched by position. Each record holds the keys
        ``color_url`` (image-store path of the swatch), ``color_name`` and
        ``color_short`` and is wrapped with ``basic.cdata``.

        Returns the list of wrapped JSON strings.
        """
        encoded = []
        for position, color_url in enumerate(color_urls):
            swatch = {}
            swatch['color_url'] = self.get_server_path_single(color_url)
            swatch['color_name'] = color_names[position]
            swatch['color_short'] = color_codes[position]
            encoded.append(basic.cdata(simplejson.dumps(swatch)))
        return encoded

    # function for getting image server path
    def get_server_path_single(self, url):
#        return url
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # function for getting image path for field of images
    def get_server_path(self, urls):
#        return urls
        new = []
        for url in urls:
            new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    #function for getting basic information for product
    def get_basic_info(self, hxs):
        """Return name, price, old price and description of the product.

        The price falls back to the ``PriceDisplay`` span when the regular
        price div is missing. Prices are reduced to digits and dots.

        Returns ``(name, price, old_price, [description])`` where the first
        three are lists. Raises IndexError when the details div or any price
        is missing.
        """
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
        details_html = hxs.select('//div[@id="details"]').extract()[0]
        description = basic.clean_string(basic.cdata(details_html))
        old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
        if not price:
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        cleaned_price = [re.sub('[^0-9.]', '', price[0])]
        return name, cleaned_price, old_price, [description]

    # function for getting rating, both number and sentence (e.g. Rating 5 out of 6 votes)
    def get_rating(self, hxs):
        """Return the rating number and the full rating sentence.

        The sentence is the text of the second paragraph of the
        ``Customerssay`` div. The number is what ``basic.get_middle_text``
        finds between "Rating:" and "out" once spaces are removed.

        Returns ``(rating, sentence)``; ``rating`` is ``[]`` when the page
        has no rating sentence.
        """
        sentence = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if not sentence:
            return [], sentence
        compact = sentence[0].replace(" ", "")
        return basic.get_middle_text(compact, "Rating:", "out"), sentence

    #function for getting reviews, returning rating and field of json reviews
    # or empty fields if there's no reviews
    def get_reviews(self, hxs):
        """Return the reviews as JSON records, or ``[]`` when there are none.

        Only the first ``prodReview`` div is read; its title, text, author
        and location paragraphs are handed to ``make_reviews_json``.
        """
        review_blocks = hxs.select('//div[@class="prodReview"]')
        if not review_blocks:
            return []
        first_block = review_blocks[0]
        title = first_block.select('p[@class="review_title"]/text()').extract()
        text = first_block.select('p[@class="review_text"]/text()').extract()
        author = first_block.select('p[@class="review_author"]/text()').extract()
        location = first_block.select('p[@class="review_location"]/text()').extract()
        return self.make_reviews_json(title, text, author, location)

    # function for making json for reviews
    # currently not in use. cause there are no reviews in DPW design
    def make_reviews_json(self, title, text, author, location):
        """Build one JSON record per review.

        The four lists are matched by position; surplus entries of a longer
        list are ignored. Each record holds the keys ``title``, ``text``,
        ``author`` and ``location``. It is serialized with
        ``simplejson.dumps`` so that quotes and backslashes in the review
        are escaped, then wrapped with ``basic.cdata``.

        Returns the list of wrapped JSON strings.
        """
        jsons = []
        for review_title, review_text, review_author, review_location in zip(title, text, author, location):
            review = {
                'title': review_title,
                'text': review_text,
                'author': review_author,
                'location': review_location,
            }
            jsons.append(basic.cdata(simplejson.dumps(review)))
        return jsons

    #function for getting size chart image
    def get_size_image(self, hxs):
        """Return the ``src`` values of the images in the size chart panel."""
        return hxs.select('//div[@class="TabbedPanelsContent cells"]/img/@src').extract()

    #function for getting image swatches, returning fields (image_urls, image name, product color image)
    def get_image_swatches(self, hxs):
        """Read the color swatches of the product.

        For every ``lolite`` div the swatch image url, the color name (image
        ``alt``), the product image for that color (link ``rev``) and the
        color code (second comma-separated argument of the link's
        ``onclick``, quotes removed) are collected.

        Returns four parallel lists:
        ``(swatch urls, color names, product images, color codes)``.
        Raises IndexError when a swatch lacks one of these attributes.
        """
        swatch_urls, names, zoom_images, codes = [], [], [], []
        for swatch in hxs.select('//div[@class="lolite"]'):
            swatch_urls.append(swatch.select('a/img/@src').extract()[0])
            names.append(swatch.select('a/img/@alt').extract()[0])
            # if zoom image needed, this is the place to get it
            zoom_images.append(swatch.select('a/@rev').extract()[0])
            onclick = swatch.select('a/@onclick').extract()[0]
            codes.append(onclick.split(",")[1].replace("'", ""))
        return swatch_urls, names, zoom_images, codes

    #function for getting additional images, returns field of images or empty field if there is no
    def get_extra_images(self, hxs):
        """Return the additional image urls, or ``[]`` when there are none.

        The urls are taken from the script inside the ``AddImg`` div: the
        first double-quoted value found by ``basic.get_middle_text`` is split
        on commas.
        """
        scripts = hxs.select('//div[@id="AddImg"]/script/text()').extract()
        if not scripts:
            return []
        quoted = basic.get_middle_text(scripts[0], '"', '"')
        return quoted[0].split(",")

    #function for getting product id from the page
    def get_product_id(self, hxs):
        """Return the product id found in the script of the ``wrap`` div.

        The id is the first value ``basic.get_middle_text`` finds between
        ``productid","`` and the next double quote.
        """
        scripts = hxs.select('//div[@id="wrap"]/script/text()').extract()
        matches = basic.get_middle_text(scripts[0], 'productid","', '"')
        return matches[0]

    # function for getting sizes from another url, returning a list of jsons for sizes
    # one id from the page is 115NB, if needed it can be hardcoded here for testing
    # currently not in use
    def get_sizes(self, id, hxs):
        """Fetch the size options of product ``id`` and return them as jsons.

        The ``showmode``, ``itemmode`` and ``salemode`` form values of the
        current page are sent along to the product-showoptions url, with the
        first option fixed to "AV". The response is downloaded with urllib2.

        Returns a list of ``basic.cdata``-wrapped JSON strings with the keys
        ``size_short``, ``size_full``, ``some_number`` and ``some_id``.
        Raises IndexError when one of the form inputs is missing or a record
        has fewer than four fields.
        """
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
        jsons = []
        print "reading page..."
        page = urllib2.urlopen(url).read()
        print "page read"
        # Flatten the response into "]"-terminated, comma-separated records:
        # drop quotes, turn "[" into "," and remove the resulting ",,".
        # NOTE(review): presumably the response is a nested bracketed list of
        # quoted values - confirm against a real response.
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        # the last two pieces of the split are not size records
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            # The values are interpolated as-is, without JSON escaping. The
            # backslash continuation keeps the next line's leading spaces
            # inside the string.
            json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
                    "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    # function that handles creating subproducts, can be implemented for the usual way product for every combination
    # of size and color if needed
    def create_subproducts(self, id, color_names, product_image, color_codes, hxs):
        """Write the color subproducts of master product ``id`` to the xml.

        With colors, one child ``<id>_<position>`` is written per color,
        carrying the color name, color code, product image path and its
        sizes. Without colors a single child ``<id>_0`` with the color
        "NO_COLOR" is written and its sizes are requested with an empty
        color code.

        Always returns 0.
        """
        child = LydiasItem()
        if len(color_names) > 0:
            # one child per color; its sizes are created for that color code
            for position, color_name in enumerate(color_names):
                color_code = color_codes[position]
                print("name :" + color_name + "  code:" + color_code)
                child_id = id + "_" + str(position)
                child['master_product_id'] = [id]
                child['product_id'] = [child_id]
                child['color'] = [color_name]
                child['color_short'] = [color_code]
                child['normal_image_url'] = self.get_server_path([product_image[position]])
                child['in_stock'] = ["IN_STOCK"]
                child['custom_size'] = self.create_sizes_subproducts(id, child_id, color_code, hxs)
                self.xml.create_xml(child)
                child.clear()
            return 0

        # no colors for this product: single child, sizes created with an
        # empty string instead of an actual color code
        child_id = id + "_" + "0"
        child['master_product_id'] = [id]
        child['product_id'] = [child_id]
        child['color'] = ["NO_COLOR"]
        child['custom_size'] = self.create_sizes_subproducts(id, child_id, "", hxs)
        self.xml.create_xml(child)
        return 0

    # function for creating child products for sizes
    # little messy with all the commented lines but those lines can be used if needed to go back to old way with
    # child products instead of json
    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return the size options of a product as a list of JSON records.

        With a color code, the sizes are downloaded from the
        product-showoptions url for ``main_id`` and that color; each record
        has the keys ``size_short``, ``price_url`` and ``size``. Without a
        color code, the sizes are the texts of the ``not_size`` divs on the
        page; each record has the keys ``size_short`` and ``price_url``.

        ``id`` is the child product id. It is not used here; it was only
        needed by the old approach of writing each size as its own child
        product instead of a JSON record.

        Every record is wrapped with ``basic.cdata``.
        """
        print(color_code)
        jsons = []

        # no color: sizes are listed directly on the page
        if color_code == "":
            for size_short in hxs.select('//div[@class="not_size"]/text()').extract():
                entry = {}
                entry['size_short'] = size_short
                entry['price_url'] = self.get_size_price(str(main_id), "", size_short)
                jsons.append(basic.cdata(simplejson.dumps(entry)))
            return jsons

        # color provided: sizes come from a separate request
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
            "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
        page = urllib2.urlopen(url).read()
        page = page.replace("'", "").replace("[", ",").replace(",,", "")
        records = page.split("]")
        # the last two pieces of the split are not size records
        for record in records[:max(len(records) - 2, 0)]:
            fields = record.split(",")
            entry = {}
            entry['size_short'] = fields[0]
            entry['price_url'] = self.get_size_price(str(main_id), str(color_code), fields[0])
            entry['size'] = fields[1]
            jsons.append(basic.cdata(simplejson.dumps(entry)))
        return jsons

#        return 0

    # function for getting price for combination of every size and color, can return url where the price is, or can
    # parse that url to get that actual price but will drastically increase scraping time
    def get_size_price(self, id, color, size):
        """Return the url where the price of one size/color variant is found.

        The sku sent is "<id> <color> <size>", or "<id> <size>" when
        ``color`` is an empty string; spaces are encoded as ``%20``. Only
        the url is built, the price itself is not downloaded.
        """
        if color == "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
            return url.replace(" ", "%20")
        url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
              "0&showmode=1&rnum=388" % (str(id), str(color), size)
        return url.replace(" ", "%20")

    # just adding part for getting absolute paths for relative paths from page
    def absolute_path(self, urls):
        """Prefix every relative path in ``urls`` with the site address.

        Returns a new list; the input is not modified.
        """
        return ["http://www.lydiasuniforms.com" + relative for relative in urls]

    # function used for getting embroidery information from the client's page; it was used only once,
    # because embroidery is the same for all the products
    def get_emb(self, hxs):
        """Scrape the embroidery options and write them to the "emb" xml.

        Three option lists are read from the page: lettering colors
        (``threadcolor``), letterings (``lettering``) and logos
        (``logoname``). The first option of every list is skipped. For each
        remaining option a JSON record with the keys ``type``, ``name`` and
        ``url`` (image-store path of the option image) is built and wrapped
        with ``basic.cdata``.

        Returns the list of original image urls of all options.
        """
        lettering_colors = hxs.select('//select[@id="threadcolor"]/option/@value').extract()
        urls = []
        d = {}
        colors = []
        for i in range(1, len(lettering_colors)):
            d['type'] = "lettering colors"
            d['name'] = lettering_colors[i]
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
            d['url'] = self.get_server_path_single(url)
            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(d)))
        lettering = hxs.select('//select[@id="lettering"]/option/@value').extract()
        l = {}
        letterings = []
        for i in range(1, len(lettering)):
            l['type'] = "lettering"
            l['name'] = lettering[i]
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering[i].lower().replace(' ', '_') + ".gif"
            l['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(l)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        logos = {}
        log = []
        for i in range(1, len(logo)):
            logos['type'] = "logo"
            logos['name'] = logo[i]
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo[i].lower() + ".jpg"
            logos['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(logos)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        # NOTE(review): `xml` is not defined in this method; it must be a
        # module-level name, otherwise this raises NameError. The rest of the
        # class uses self.xml - confirm which writer is intended.
        xml.create_xml(item)
        xml.write_xml("emb")

        return urls

    def handle_not_provided(self):
        """Write every product that has no url to the xml as not available.

        Each entry of ``self.no_urls`` is written with its product id, its
        name and the stock status "NOT_AVAILABLE".
        """
        item = LydiasItem()
        # enumerate gives each row its own position, so a repeated product id
        # gets its own name instead of the name of the first occurrence
        for index, product_id in enumerate(self.no_urls['product_ids']):
            item['product_id'] = [product_id]
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handle the spider_closed signal at the end of scraping.

        Builds the run report, updates the database (database runs only),
        writes the XML, mails the report and, for database runs, stores the
        report under ``logs/<spider name>/``. The export of the XML to the
        database is currently disabled.

        ``spider`` is supplied by the signal and is not used.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            # Default to the spider name: without it a failure before
            # get_name() returns leaves `filename` unbound and the XML/report
            # steps below die with NameError instead of reporting the failure.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, filename, "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load the product lists from the excel file given in the arguments.

        Fills ``self.products`` with the keys ``urls``, ``product_ids`` and
        ``names``. When reading succeeded, duplicates are removed, the rows
        without a url are moved to ``self.no_urls`` and the status field is
        added to both. Read errors are reported through
        ``self.exc.code_handler`` with code 103.
        """
        workbook = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = {}
        try:
            self.products['urls'] = workbook.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = workbook.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = workbook.read_excel_collumn(2, 15)
        except IOError as e:
            reason = "I/O error {0}: {1}".format(e.errno, e.strerror)
            report = reason + "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=report)
        except StandardError:
            reason = "Error reading excel file"
            report = reason + "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=report)
        else:
            # post-processing runs only on completely read data
            self.products = workbook.delete_duplicates_dict(self.products)
            self.products, self.no_urls = workbook.separate_no_urls(self.products)
            self.products = workbook._add_none_status(self.products)
            self.no_urls = workbook._add_none_status(self.no_urls)

Exemple #6

0

Afficher le fichier

Fichier : partylite_spider.py Projet : marjevtic/testMarko

class PartyliteSpider(CrawlSpider):
    """Spider registered under the name "partylite".

    The product urls are loaded in ``__init__`` (from the database or from an
    excel file) and requested from ``parse``; each response is handled by
    ``parse_can``.
    """
    name = "partylite"
    allowed_domains = ["partylite.biz"]
    # presumably only a seed request: parse() ignores the response and
    # requests self.products['urls'] instead
    start_urls = ["http://www.zmags.com"]
    # number of product responses handled so far (incremented in parse_can)
    counter = 0

    def __init__(self, *a, **kw):
        """Set up the spider from the command line arguments.

        Connects ``spider_closed`` to the matching signal, reads the run
        arguments into ``self.d`` and loads ``self.products`` either from the
        database or from an excel file.
        """
        super(PartyliteSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        args = PartyliteTerminal(sys.argv, self.name).get_arguments()
        self.d = args
        self.images_store = "/" + settings['IMAGES_STORE']
        self.users = party.get_users(settings, args)
        self.exc = ZmagsException(50)
        self.production = args['env']
        self.upload = args['upload']
        self.english = args['lang']
        self.file_name = args['file']
        if args['database']:
            db = self.database = Database()
            db.connect()
            self.products, self.no_urls = db.select_products(args['catalog_id'],
                                                             args['product_id'])
            db.disconnect()
            # urls from the database still need the environment/user applied
            self.change_url_list()
        else:
            self.get_lists_from_excel()
        self.xml = CommonXml()
        party.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Yield one request per product url, all handled by ``parse_can``.

        For the ``english`` and ``french`` languages the us user in the url is
        swapped for the matching canadian user and the language code is put
        in the request meta.  Any other language value yields nothing.
        """
        # language -> (key of the replacement user, code stored in meta)
        canadian = {'english': ('canada_en', "eng"),
                    'french': ('canada_fr', "fr")}
        for url in self.products['urls']:
            lang = self.d['lang']
            if lang == 'us':
                yield Request(url, callback=self.parse_can, dont_filter=True)
            elif lang in canadian:
                user_key, code = canadian[lang]
                c_url = url.replace(self.users['us'], self.users[user_key])
                req = Request(c_url, callback=self.parse_can, dont_filter=True)
                req.meta['language'] = code
                yield req

    def change_url_list(self):
        for i in range(0, len(self.products['urls'])):
            if not self.production:
                self.products['urls'][i] = self.products['urls'][i].replace('www', 'qa')
            self.products['urls'][i] = self.products['urls'][i].replace('XXXXX', self.users['us'])

    def get_in_stock(self, hxs):
        """Gets in stock information about product.
        The product counts as in stock when the page has no
        availability container."""
        availability = hxs.select('//div[@id="availability_container"]').extract()
        return ["NOT_IN_STOCK"] if availability else ["IN_STOCK"]

    def get_basic_info(self, hxs):
        """Getting basic info about products (name, shown with).
        Returns both as fields; a missing value stays an empty field."""
        title = hxs.select('//div[@id="product_name"]/text()').extract()
        title = basic.cdata_field(title) if title else title
        companions = hxs.select('//div[@id="shown_with_container"]').extract()
        if not companions:
            return title, companions
        # only the first container is kept
        return title, [basic.cdata(companions[0])]

    def get_description(self, hxs):
        """Gets the item description as a one element field.
        Tags are removed and the fraction slash (U+2044) is replaced
        with a plain slash."""
        blocks = hxs.select('//div[@id="item_description"]').extract()
        text = basic.cdata(basic.remove_tags(blocks[0]))
        return [text.replace(u"\u2044", "/")]

    def get_price(self, hxs):
        """Getting product prices.
        Returns the regular price as a field and the discount price field
        (empty when there is no discount)."""
        # the price can sit in one of several places; first hit wins
        locations = ('//span[@id="divUnitPrice"]/text()',
                     '//div[@id="product_price"]/span[1]/text()',
                     '//div[@id="product_price"]/text()')
        found = []
        for xpath in locations:
            found = hxs.select(xpath).extract()
            if found:
                break
        discount = hxs.select('//div[@id="product_price"]/span[@class="pc-salePrice"]/text()').extract()
        text = basic.clean_string(found[0])
        text = re.sub(" +", " ", text)
        # drop the english / french label in front of the amount
        text = text.replace("Price:", "").replace("Prix:", "")
        text = basic.cdata(text.strip())
        if discount:
            discount = basic.cdata_field(discount)
        return [text], discount

    def get_add_to_cart_id(self, page):
        """Gets add to cart id from the javascript on the page.
        Looks inside the isOrderStarted branch for the first argument
        of addItemToCart."""
        order_branch = basic.get_middle_text(page, "if(isOrderStarted){", "}else")[0]
        return basic.get_middle_text(order_branch, "addItemToCart(", ",")

    def create_subproducts(self, page):
        """Gets information about colors from javascript.
        Returns field of dicts, one per ProductGroupProduct entry found
        in the largeImages script, or an empty field when the page has
        no such script."""
        sections = page.split("var largeImages = new Array();")
        if len(sections) < 2:
            print("This product has no images")
            return []
        script = sections[1].split("colorDropdownArray")[0]
        entries = basic.get_middle_text(script, "ProductGroupProduct(", ");")
        names_by_color = self.get_image_names(page)
        variants = []
        for entry in entries:
            # constructor arguments are split on the closing quote + comma
            fields = entry.split("',")
            variant = {}
            variant['normal_image_url'] = (
                "http://qa.partylite.biz/imaging/resize?fileName=/productcatalog/production"
                + self.custom_clean_string(fields[26], True))
            variant['description'] = basic.cdata(self.custom_clean_string(fields[27]))
            variant['color_id'] = self.custom_clean_string(fields[7], True)
            variant['swatch_color'] = basic.cdata(self.custom_clean_string(fields[9]).replace(" ", ""))
            variant['name'] = basic.cdata(names_by_color[variant['color_id']])
            variant['add_to_cart_id'] = self.custom_clean_string(fields[0], True).replace(" ", "")
            variant['price'] = self.custom_clean_string(fields[10], True)
            variants.append(variant)
        return variants

    def custom_clean_string(self, string, spaces=False):
        """Custom function for cleaning strings.
        Removes new line, return and tab signs and single quotes.
        With spaces=False runs of spaces are squeezed into one space,
        with spaces=True every space is removed."""
        for sign in ("\r", "\n", "\t"):
            string = string.replace(sign, "")
        if spaces:
            string = string.replace(' ', '')
        else:
            string = re.sub(' +', ' ', string)
        return string.replace("'", "")

    def get_image_names(self, page):
        """Gets color names for color swatches.
        Returns a dict built from every DropDownInfo call on the page."""
        lookup = {}
        for segment in page.split("new DropDownInfo")[1:]:
            label = basic.get_middle_text(segment, "'", "')")[2]
            key = basic.get_middle_text(segment, "('", "'")[0]
            lookup[key] = label
        return lookup

    def get_recommended(self, hxs):
        """Gets recommended product information.
        Only the first block of the right column is used; it is returned
        as a one element field holding a json dict (link, image, name)."""
        blocks = hxs.select('//div[@id="right_column_container"]/div')
        recommended = []
        for block in blocks:
            #to do: see how to get full href(different accounts)
            info = {}
            info['link'] = block.select('div/a/@href').extract()[0]
            info['image'] = ("http://www.partylite.biz/imaging/resize"
                             + block.select('div/a/img/@src').extract()[0])
            info['name'] = block.select('div/a/text()').extract()[0]
            recommended.append(basic.cdata(simplejson.dumps(info)))
            # the remaining blocks are ignored
            break
        return recommended

    def get_reviews(self, page):
        """Gets average product rating.
        Downloads the bazaarvoice review script for the product and returns
        a one element field with the alt text of the rating image, or an
        empty field when the script has no rating image."""
        from contextlib import closing

        review_id = self.get_review_id(page)
        url = "http://partylite.ugc.bazaarvoice.com/8504-en_us/" + review_id + "/reviews.djs?format=embeddedhtml"
        url = url.replace(" ", "")
        # urllib2 responses are not context managers; closing() makes sure
        # the connection is released even when read() fails
        with closing(urllib2.urlopen(url)) as response:
            script = response.read()
        # the markup is embedded in javascript, so quotes and slashes are
        # backslash escaped in the downloaded text
        rating_blocks = basic.get_middle_text(script, '<div class=\\"BVRRRatingNormalImage\\">', '<\\/div>')
        if not rating_blocks:
            return []
        return [basic.get_middle_text(rating_blocks[0], 'alt=\\"', '\\"')[0]]

    def get_more_images(self, page):
        """Gets field of images.
        Returns absolute urls for every moreImages entry in the page
        javascript (www host in production, qa host otherwise), or an
        empty field when the script is missing."""
        try:
            script = basic.get_middle_text(page, "var moreImages", "var numberOfImages")[0]
        except IndexError:
            print("This product has no images.")
            return []
        assignments = basic.get_middle_text(script, "moreImages[", "';")
        # return cdata here if needed to go with absolute links
        return [
            ("http://www.partylite.biz" if self.production else "http://qa.partylite.biz")
            + assignment.split("= '")[1]
            for assignment in assignments
        ]

    def get_absolute(self, relatives):
        """Creates absolute path for images. [DEPRECATED]
        Please check if there is a need for this function again.
        If needed dimensions of images got from the client server
        can be changed here.

        Returns a new field with the resize url prefix put in front of
        every relative path."""
        # A leftover debug print followed by os._exit(0) used to terminate the
        # whole process on the first call; it was removed so the function
        # returns the urls again.
        # add width, height here for different dimensions
        # NOTE(review): the previous comment said this host is meant to be qa
        # always, but the code uses www - confirm which one is intended.
        prefix = "http://www.partylite.biz/imaging/resize?fileName=/productcatalog/production"
        return [prefix + relative for relative in relatives]

    def get_review_id(self, page):
        """Gets review id that is used in javascript for reviews.
        It is the first quoted value following 'productId: ' on the page."""
        matches = basic.get_middle_text(page, 'productId: "', '"')
        return matches[0]

    def write_subproducts(self, id, list, xml):
        """Writes child products to xml.
        id is the master product id field, list holds one dict per child
        product and xml is the Xml instance the children are written to.
        Child ids are the master id plus "_" and the child's position.
        Always returns 1."""
        for position, variant in enumerate(list):
            child = PartyliteItem()
            child['master_product_id'] = id
            child['product_id'] = [id[0] + "_" + str(position)]
            child['in_stock'] = ["IN_STOCK"]
            # every value of the variant becomes a one element field
            for field, value in variant.iteritems():
                child[field] = [value]
            xml.create_xml(child)
        return 1

    def parse_can(self, response):
        """Parse function for scraping product pages (us and canadian sites).
        For canadian sites the language is sent in the request meta and is
        appended to the product id.

        A redirected request is written to xml as NOT_AVAILABLE with status
        'no_avail'.  Otherwise the product is scraped; on success the status
        becomes "ran" and the item is written, on error the status becomes
        "error" and code 100 is reported.  Returns the item, with
        'image_urls' set when additional images were found."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        item = PartyliteItem()
        hxs = HtmlXPathSelector(response)
        image_urls = []
        if 'redirect_urls' in response.request.meta:
            # the id is taken from the url that was originally requested
            source_id = self.get_id(response.request.meta['redirect_urls'][0])[0]
            item['product_id'] = [source_id]
            self.exc.code_handler(102, response.request.meta['redirect_urls'])
            if 'language' in response.request.meta:
                item['product_id'] = [source_id + "_can" + "_" + response.meta['language']]
            try:
                index = self.products['product_ids'].index(source_id)
                item['name'] = [basic.cdata(item['product_id'][0]
                                + self.products['names'][index])]
                self.products['status'][index] = 'no_avail'
            except (KeyError, ValueError):
                # list.index() raises ValueError (not KeyError) for an id
                # that is not in the list
                print("This %s id is not in list" % (item['product_id'][0]))
            item['in_stock'] = ['NOT_AVAILABLE']
            item['product_id'] = self.remove_spaces(item['product_id'])
            self.xml.create_xml(item)
        else:
            index = self.products['product_ids'].index(self.get_id(response.url)[0])
            try:
                item['product_id'] = self.get_id(response.url)
                item['name'], item['shown_with'] = self.get_basic_info(hxs)
                item['description'] = self.get_description(hxs)
                if 'language' in response.meta:
                    item['product_id'] = [item['product_id'][0] + "_can" + "_" + response.meta['language']]
                response.meta['item'] = item
                page = " ".join(hxs.select('//html').extract())
                image_urls = self.get_more_images(page)
                item['normal_image_url'] = self.get_server_path_field(image_urls)
                item['in_stock'] = self.get_in_stock(hxs)
                color_products = self.create_subproducts(page)
                if color_products:
                    # was the undefined name `xml`, which raised NameError and
                    # marked every product with color variants as "error"
                    self.write_subproducts(item['product_id'], color_products, self.xml)
                else:
                    item['add_to_cart_id'] = self.get_add_to_cart_id(page)
                    item['custom_price'], item['custom_discount'] = self.get_price(hxs)
                self.products['status'][index] = "ran"
            except StandardError:
                basic.print_error()
                self.products['status'][index] = "error"
                self.exc.code_handler(100, response.url)
            else:
                item['product_id'] = self.remove_spaces(item['product_id'])
                self.xml.create_xml(item)
        if image_urls:
            item['image_urls'] = image_urls
        return item

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.  When the run came
        from the database the message is also written to a log file."""
        msg = party.get_settings_message(self.d)
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            # fall back to the spider name so the xml, mail and log steps
            # below still have a filename when the database cannot be reached
            # (before, a failure here left `filename` unbound)
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        # the log keeps the plain name, the xml name carries the language
        logname = filename
        filename = "{0}_{1}".format(filename, self.d['lang'])
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        from modules.mail import Mail
        from modules.export_to_db import CommonExport
        exp = CommonExport()
        if self.upload:
            try:
                if self.d['lang'] == 'us':
                    exp.xml_to_db(self.name, filename, "55892247-1b92-4ff9-a8a3-33cc976f9341")
                else:
                    exp.xml_to_db(self.name, filename, "9cb6c676-c14f-403b-b94f-b981184e1de0")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        mail = Mail()
        try:
            mail.send_mail(msg, "Partylite: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Partylite: {0}".format(filename), self.d['email'])
        except Exception:
            # mailing is best effort; the failure is only noted in the log
            msg += "\nSending mail failed."
        if self.d['database']:
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, logname), 'w') as f:
                f.write(msg)

    def get_id(self, url):
        """Gets id from product url.
        Returns a one element field with the text right after the first
        "&sku=" (up to the next "&sku=", if any)."""
        pieces = url.split("&sku=")
        return [pieces[1]]

    def get_server_path(self, url):
        """Gets server path for image url."""
        url = url.split("partylite.biz")[1]
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_server_path_field(self, urls):
        """Getting server path for field of image urls.
        Every url is converted the same way get_server_path does it."""
        return [self.get_server_path(link) for link in urls]

    def remove_spaces(self, field):
        """Returns a new field with all spaces removed from every value."""
        return [value.replace(' ', '') for value in field]

    def get_lists_from_excel(self):
        """Read the product lists from the excel file given in ``self.d['file']``.

        Stores the ``urls``, ``product_ids`` and ``names`` columns in
        ``self.products``; read failures are reported through
        ``self.exc.code_handler`` with code 103.  The data is then passed
        through the workbook's ``delete_duplicates_dict``,
        ``separate_no_urls`` and ``_add_none_status`` helpers, which also
        produces ``self.no_urls``.
        """
        excel_path = "xls/{0}/{1}.xls".format(self.name, self.d['file'])
        xls = PartyliteExcel(path=excel_path, user=self.users['us'], production=self.production)
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            # was the undefined name `exc`, which raised NameError and hid
            # the real error
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

Exemple #7

0

Afficher le fichier

Fichier : burton_spider.py Projet : marjevtic/testMarko

class BurtonSpider(CrawlSpider):
    """Spider registered under the name "burton".

    Product urls are loaded in ``__init__`` (from the database or from an
    excel file) and every response is handled by ``parse``.
    """
    name = "burton"
    allowed_domains = ["example.com"]
    # class level default; __init__ assigns the instance's start_urls
    start_urls = ["http://www.example.com"]
    # number of responses handled so far (incremented in parse)
    counter = 0

    def __init__(self, *a, **kw):
        """Set up the spider from the command line arguments.

        Connects ``spider_closed`` to the matching signal, loads
        ``self.products`` / ``self.no_urls`` from the database or from an
        excel file, writes the products without url through
        ``handle_not_provided`` and crawls the loaded product urls.
        """
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        # A hard-coded single test url used to overwrite this list right
        # after the assignment, so the loaded products were never crawled
        # and parse() could not find the response url in self.products.
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page and write it to xml.

        A redirected request is written as NOT_AVAILABLE with status
        "no_avail"; otherwise the product details are scraped and the status
        becomes "ran".  Returns the item, or None when scraping raised (the
        status is then "error" and code 100 is reported).
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        page = hxs.extract()
        redirected = 'redirect_urls' in response.request.meta
        # products are looked up by the url that was originally requested
        if redirected:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if redirected:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(
                    hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(
                    page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(
                    page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except Exception:
            # Exception instead of a bare except, so KeyboardInterrupt and
            # SystemExit are no longer swallowed as scraping errors
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def handle_not_provided(self):
        """Write every product that has no url to xml as NOT_AVAILABLE.

        Ids and names in ``self.no_urls`` are matched by position.
        """
        item = BurtonItem()
        # enumerate gives the position directly; looking it up with
        # list.index() was quadratic and returned the first match for
        # duplicate ids
        for index, product_id in enumerate(self.no_urls['product_ids']):
            item['product_id'] = [product_id]
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return the product id field and the name field (in that order)."""
        heading = hxs.select('//h1[@class="productHeading"]/text()').extract()
        identifier = hxs.select('//input[@name="productId"]/@value').extract()
        return identifier, heading

    def get_server_path(self, url):
        path = self.images_store + "/full/" + hashlib.sha1(
            url).hexdigest() + ".jpg"
        return path

    def get_prices(self, hxs):
        """Return the price field and the old price field.

        The old price field stays empty when the page shows none.
        """
        current = hxs.select('//div[@class="op"]/text()').extract()
        current = [basic.get_price(current[0])]
        previous = hxs.select('//span[@class="lp"]/text()').extract()
        if not previous:
            return current, previous
        return current, [basic.get_price(previous[0])]

    def get_description(self, hxs):
        """Return the description field and the features field.

        The description is the fourth text node of the product info block;
        the first feature list is cut to 2000 characters.
        """
        text_nodes = hxs.select(
            '//div[@id="FieldsetProductInfo"]/text()').extract()
        description = text_nodes[3]
        feature_lists = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if feature_lists:
            feature_lists = [feature_lists[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(feature_lists)

    def get_variants(self, page):
        """Gets jsons for colors from the skuSizeColorObj javascript.
        Returns the json field, the swatch image urls and the color names;
        inside each json the swatch url is replaced with its server path.
        """
        script = basic.get_middle_text(page,
                                       'var skuSizeColorObj = new Array();',
                                       '</script>')[0]
        variant_jsons, swatch_urls, color_names = [], [], []
        # the text before the first skuSizeColorObj is not an entry
        for chunk in script.split('skuSizeColorObj')[1:]:
            raw = basic.get_middle_text(chunk, '= ', ';')
            variant = simplejson.loads(burton.replace_for_json(raw[0]))
            swatch_urls.append(variant['swatchURL'])
            color_names.append(variant['ColorDesc'])
            variant['swatchURL'] = self.get_server_path(variant['swatchURL'])
            variant_jsons.append(basic.cdata(simplejson.dumps(variant)))
        return variant_jsons, swatch_urls, color_names

    def get_all_sizes(self, page):
        """Return a one element field with the json list of all sizes
        found in the distsizeobj javascript."""
        script = basic.get_middle_text(page, 'var distsizeobj=new Array();',
                                       'var indexcolor=0;')[0]
        size_values = basic.get_middle_text(script, ']="', '";')
        return [basic.cdata(simplejson.dumps(size_values))]

    def get_colors(self, page, color_names):
        """Gets color information with images from javascript on the page.
        Returns a field of jsons holding the color name and the server path
        of the image for that color, and a field of the original image urls
        that can be used for download later."""
        script = basic.get_middle_text(page, 'var imageMap_0 = new Array();',
                                       '</script>')[0]
        raw_colors = basic.get_middle_text(script, '] = ', ';')
        image_urls, colors_json = [], []
        # colors are matched to their names by position
        for position, cname in enumerate(color_names):
            color = simplejson.loads(burton.replace_color_json(raw_colors[position]))
            color['cname'] = cname
            color.pop('reg')
            image_urls.append(color['enh'])
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.  When the run came
        from the database the message is also written to a log file."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml
        if self.d['database']:
            # fall back to the spider name so the xml, mail and log steps
            # below still have a filename when the database cannot be reached
            # (before, a failure here left `filename` unbound)
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename),
                               self.d['email'])
        except Exception:
            # mailing is best effort; the failure is only noted in the log
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Read the product lists from the excel file given in ``self.d['file']``.

        Stores the ``urls``, ``product_ids`` and ``names`` columns in
        ``self.products``; read failures are reported through
        ``self.exc.code_handler`` with code 103.  The data is then passed
        through the workbook's ``delete_duplicates_dict``,
        ``separate_no_urls`` and ``_add_none_status`` helpers, which also
        produces ``self.no_urls``.
        """
        workbook = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = {}
        try:
            self.products["urls"] = workbook.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = workbook.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = workbook.read_excel_collumn(2, 15)
        except IOError as err:
            report = ("I/O error {0}: {1}".format(err.errno, err.strerror)
                      + "\nError occurred for given file: {0}".format(self.d['file']))
            self.exc.code_handler(103, msg=report)
        except StandardError:
            report = ("Error reading excel file"
                      + "\nError occurred for given file: {0}".format(self.d['file']))
            self.exc.code_handler(103, msg=report)
        self.products = workbook.delete_duplicates_dict(self.products)
        self.products, self.no_urls = workbook.separate_no_urls(self.products)
        self.products = workbook._add_none_status(self.products)
        self.no_urls = workbook._add_none_status(self.no_urls)

Exemple #8

0

Afficher le fichier

class LydiasSpider(CrawlSpider):
    """Spider that scrapes lydiasuniforms.com product pages into the xml feed.

    The products to scrape come either from the database or from an excel
    file, depending on the ``database`` run argument.  Every scraped page is
    written to ``self.xml`` as a master product plus one child per colour.
    """
    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Read the run arguments, load the product lists and set start urls."""
        super(LydiasSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        # fix for bug with links they provide
        self.products['urls'] = basic.cut_string_field(self.products['urls'],
                                                       "&cat=")
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Scrape one product page, write it to the xml and return the item.

        The status of the product in ``self.products['status']`` is set to
        "ran", "not_avail" or "error" depending on the outcome.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        # a redirected request is matched by the url that was originally asked
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        product_id = self.products['product_ids'][index]
        try:
            # NOTE(review): a "searchfor" div with text is treated as "product
            # not available" - presumably the search page; confirm on the site
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [product_id]
                item['name'], item['price'], item['old_price'], item[
                    'description'] = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = \
                    self.get_image_swatches(hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(
                    color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(
                    default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                self.create_subproducts(product_id, color_names,
                                        product_image, color_codes, hxs)
                item['image_urls'] = (product_image + color_urls + chart +
                                      default_images)
                self.products['status'][index] = "ran"
            else:
                self.exc.code_handler(102, response.url)
                item['product_id'] = [product_id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except Exception:
            # any scraping failure marks the product instead of stopping the
            # crawl; SystemExit/KeyboardInterrupt are deliberately not caught
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    # function for checking if product has embroidery or not
    def get_embroidery(self, hxs):
        """Return ["True"] when the page script disables 'logocolor', else ["False"]."""
        page = hxs.select('//html').extract()[0]
        marker = "document.getElementById('logocolor').disabled = true;"
        return ["True"] if marker in page else ["False"]

    # function for creating json with all information for colors
    def make_colors_json(self, color_urls, color_names, color_codes):
        """Return one cdata-wrapped json string per colour swatch.

        The three lists are read in parallel by position of ``color_urls``.
        """
        jsons = []
        for i, color_url in enumerate(color_urls):
            color = {}
            color['color_url'] = self.get_server_path_single(color_url)
            color['color_name'] = color_names[i]
            color['color_short'] = color_codes[i]
            jsons.append(basic.cdata(simplejson.dumps(color)))
        return jsons

    # function for getting image server path
    def get_server_path_single(self, url):
        """Return the image-store path (sha1 of the url + ".jpg") for one url."""
        return self.images_store + "/full/" + hashlib.sha1(
            url).hexdigest() + ".jpg"

    # function for getting image path for field of images
    def get_server_path(self, urls):
        """Return the image-store paths for a list of urls."""
        return [self.get_server_path_single(url) for url in urls]

    # function for getting basic information for product
    def get_basic_info(self, hxs):
        """Return (name, price, old_price, [description]) scraped from the page.

        Prices are reduced to digits and dots.  Raises IndexError when the
        details block or the price is missing on the page.
        """
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select(
            '//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()'
        ).extract()
        description = basic.cdata(
            hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select(
            '//span[@class="yourprice_product"]/text()').extract()
        if not price:
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    # function for getting rating, both number and sentence (e.g. Rating 5 out of 6 votes)
    def get_rating(self, hxs):
        """Return (rating, sentence); rating is [] when the page has no rating."""
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if not temp:
            return [], temp
        rating = basic.get_middle_text(temp[0].replace(" ", ""), "Rating:",
                                       "out")
        return rating, temp

    # function for getting reviews, returning field of json reviews
    # or empty field if there's no reviews
    def get_reviews(self, hxs):
        """Return review jsons built from the first "prodReview" block, or []."""
        reviews = hxs.select('//div[@class="prodReview"]')
        if not reviews:
            return []
        first = reviews[0]
        title = first.select('p[@class="review_title"]/text()').extract()
        text = first.select('p[@class="review_text"]/text()').extract()
        author = first.select('p[@class="review_author"]/text()').extract()
        location = first.select(
            'p[@class="review_location"]/text()').extract()
        return self.make_reviews_json(title, text, author, location)

    # function for making json for reviews
    # currently not in use. cause there are no reviews in DPW design
    def make_reviews_json(self, title, text, author, location):
        """Return one cdata-wrapped json string per review title.

        The leftover debug output and ``os._exit(0)`` that used to abort the
        whole process before any json was built have been removed.
        """
        jsons = []
        # NOTE(review): values are interpolated without json escaping, so a
        # quote inside a review yields invalid json - confirm before using
        for i in range(0, len(title)):
            entry = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" :\
                    "%s" }' % (title[i], text[i], author[i], location[i])
            jsons.append(basic.cdata(entry))
        return jsons

    # function for getting size chart image
    def get_size_image(self, hxs):
        """Return the src values of the size chart images (possibly empty)."""
        return hxs.select(
            '//div[@class="TabbedPanelsContent cells"]/img/@src').extract()

    # function for getting image swatches, returning fields (image_urls, image name, product color image)
    def get_image_swatches(self, hxs):
        """Return (swatch urls, colour names, product images, colour codes).

        All four lists hold one entry per "lolite" div, in page order.
        """
        colors = hxs.select('//div[@class="lolite"]')
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in colors:
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            # if zoom image needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            # the code is the second argument of the onclick call
            color_codes.append(
                color.select('a/@onclick').extract()[0].split(",")[1].replace(
                    "'", ""))
        return color_images, color_names, products_image, color_codes

    # function for getting additional images, returns field of images or empty field if there is no
    def get_extra_images(self, hxs):
        """Return the additional image urls listed in the "AddImg" script, or []."""
        additional_images = hxs.select(
            '//div[@id="AddImg"]/script/text()').extract()
        if not additional_images:
            return []
        temp = basic.get_middle_text(additional_images[0], '"', '"')
        return temp[0].split(",")

    # function for getting product id from the page
    def get_product_id(self, hxs):
        """Return the product id found in the script of the "wrap" div."""
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        found = basic.get_middle_text(temp[0], 'productid","', '"')
        return found[0]

    def _read_size_rows(self, url):
        """Fetch a product-showoptions url and return its rows as field lists.

        Quotes are dropped and the bracketed response is split on "]"; the
        last two chunks are skipped, exactly as the callers did before.
        """
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            # always release the connection, also when reading fails
            response.close()
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        chunks = page.split("]")
        return [chunk.split(",") for chunk in chunks[:max(len(chunks) - 2, 0)]]

    # function for getting sizes from another url, returning field of jsons for sizes
    # one id from the page is 115NB, if needed here to hardcode for testing
    # currently not in use
    def get_sizes(self, id, hxs):
        """Return cdata-wrapped size jsons for product ``id`` (option "AV")."""
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (
            id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (
            showmode, itemmode, salemode)
        jsons = []
        print("reading page...")
        rows = self._read_size_rows(url)
        print("page read")
        for tmp in rows:
            entry = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
                    "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2],
                                                 tmp[3])
            jsons.append(basic.cdata(entry))
        return jsons

    # function that handles creating subproducts, can be implemented for the usual way product for every combination
    # of size and color if needed
    def create_subproducts(self, id, color_names, product_image, color_codes,
                           hxs):
        """Write the child products of master product ``id`` to the xml.

        Without colours a single "NO_COLOR" child is written; otherwise one
        child per colour.  Each child carries its sizes as json.  Returns 0.
        """
        item = LydiasItem()
        # if no colors for specific product do this part and call to creating size children with empty string instead
        # of actual color name
        if len(color_names) == 0:
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(
                id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)

        # for handling cases when there are color options for specific product, create child for every color, and call
        # for creating size children for every provided color
        else:
            for i, color_name in enumerate(color_names):
                print("name :" + color_name + "  code:" + color_codes[i])
                child_id = id + "_" + str(i)
                item['master_product_id'] = [id]
                item['product_id'] = [child_id]
                item['color'] = [color_name]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path(
                    [product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(
                    id, child_id, color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    # function for creating json entries for sizes
    # the former variant wrote one child product per size (master_product_id,
    # product_id "<id>_<i>", in_stock) through create_xml instead of json
    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return cdata-wrapped size jsons for one colour of a product.

        With a colour code the sizes are fetched from the product-showoptions
        url; without one they are read from the "not_size" divs of the page.
        ``id`` is not used by the json variant.
        """
        print(color_code)
        jsons = []
        # if block for cases when color is provided
        if color_code != "":
            showmode = hxs.select(
                '//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select(
                '//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select(
                '//input[@name="salemode"]/@value').extract()[0]
            url = ("http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp"
                   "?sku=%s&opt1=%s&opt2=-1&type2=l1type&"
                   "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193") % (
                       main_id, color_code, showmode, itemmode, salemode)
            for tmp in self._read_size_rows(url):
                size = {}
                size['size_short'] = tmp[0]
                size['price_url'] = self.get_size_price(
                    str(main_id), str(color_code), tmp[0])
                size['size'] = tmp[1]
                jsons.append(basic.cdata(simplejson.dumps(size)))
            return jsons

        # when the color is not provided different block of code cause it's done differently on the page
        for size_short in hxs.select(
                '//div[@class="not_size"]/text()').extract():
            size = {}
            size['size_short'] = size_short
            size['price_url'] = self.get_size_price(
                str(main_id), "", size_short)
            jsons.append(basic.cdata(simplejson.dumps(size)))
        return jsons

    # function for getting price for combination of every size and color, can return url where the price is, or can
    # parse that url to get that actual price but will drastically increase scraping time
    def get_size_price(self, id, color, size):
        """Return the product-showprice url for a product/colour/size combination."""
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        # the sku parts are separated by spaces, which must be url-encoded
        return url.replace(" ", "%20")

    # just adding part for getting absolute paths for relative paths from page
    def absolute_path(self, urls):
        """Return the urls prefixed with the lydiasuniforms.com host."""
        return ["http://www.lydiasuniforms.com" + url for url in urls]

    # function used for getting embroidery information from clients page, was used only once to get it
    # cause embroidery is the same for all the products
    def get_emb(self, hxs):
        """Write the embroidery options to the "emb" xml and return image urls.

        Lettering colours, letterings and logos are read from the select
        boxes of the page; the first option of each box is skipped.
        """
        urls = []
        lettering_colors = hxs.select(
            '//select[@id="threadcolor"]/option/@value').extract()
        colors = []
        for color_name in lettering_colors[1:]:
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += color_name.lower().replace(' ', '_') + ".gif"
            entry = {}
            entry['type'] = "lettering colors"
            entry['name'] = color_name
            entry['url'] = self.get_server_path_single(url)
            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(entry)))
        lettering = hxs.select(
            '//select[@id="lettering"]/option/@value').extract()
        letterings = []
        for lettering_name in lettering[1:]:
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering_name.lower().replace(' ', '_') + ".gif"
            entry = {}
            entry['type'] = "lettering"
            entry['name'] = lettering_name
            entry['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(entry)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        log = []
        for logo_name in logo[1:]:
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo_name.lower() + ".jpg"
            entry = {}
            entry['type'] = "logo"
            entry['name'] = logo_name
            entry['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(entry)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        # the xml writer lives on the spider; the bare name ``xml`` that was
        # used here is not defined in this class
        self.xml.create_xml(item)
        # NOTE(review): other calls pass (name, filename) to write_xml; the
        # single "emb" argument is kept as it was - confirm the signature
        self.xml.write_xml("emb")
        return urls

    def handle_not_provided(self):
        """Write every product without url to the xml as NOT_AVAILABLE."""
        item = LydiasItem()
        # enumerate gives each id its own position, also for repeated ids
        for index, product_id in enumerate(self.no_urls['product_ids']):
            item['product_id'] = [product_id]
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, updating
        the database and sending appropriate mail message."""
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml
        if self.d['database']:
            # fallback so the xml, mail and log below still work when the
            # database fails before the catalog name is known
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        # NOTE: the upload step (CommonExport().xml_to_db(self.name, filename,
        # "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9"), guarded by self.d['upload'])
        # is disabled for this spider.
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except Exception:
            # mailing is best effort; the failure is only noted in the log
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load the product lists from the excel file named in the arguments.

        On a successful read the lists are passed through the ``DictExcel``
        helpers that fill ``self.products`` and ``self.no_urls``; a read
        failure is reported through ``self.exc.code_handler`` with code 103.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)

Exemple #9

0

Afficher le fichier

Fichier : chome_spider.py Projet : marjevtic/testMarko

class ChomeSpider(CrawlSpider):
    """Spider that fills the xml feed from the celebratinghome.com product feed.

    The wanted product ids come from the database or from an excel file
    (``self.no_urls['product_ids']``).  The single crawled page only triggers
    ``parse``, which reads the downloaded client feed and writes one item per
    wanted id found in it.
    """
    name = "chome"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com/"]
    counter = 0

    def __init__(self, *a, **kw):
        """Read the run arguments and load the list of wanted product ids."""
        super(ChomeSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        # defined up front so spider_closed can report even if parse never ran
        self.number = 0
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'],
                                                                        self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.no_urls['product_ids'])

    def parse(self, response):
        """Process the client feed and return an item holding all image urls.

        The response itself is not read; all data comes from the feed file.
        """
        self.counter += 1
        item = ChomeItem()
        print("IDs in excel feed: {0}".format(self.total))
        item['image_urls'] = self.parse_whole_xml()
        return item

    def parse_whole_xml(self):
        """Write every wanted product found in the client feed to the xml.

        Downloads the feed first when the ``download`` argument is set;
        otherwise the process exits with code 2 if no feed file exists.
        Updates ``self.number`` and ``self.no_urls['status']`` ("ran" or
        "not_found") and returns the absolute urls of all product images.
        """
        xml_dir = "xml/{0}".format(self.name)
        feed_path = "{0}/client_feed.xml".format(xml_dir)
        file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
        downloader = Downloader()
        if self.d['download']:
            downloader.get_file(xml_dir, file_url, "client_feed")
        elif not os.path.exists(feed_path):
            basic.warning("Feed file doesn't exist please de-select no download option")
            os._exit(2)
        self.number = 0
        properties_tag = "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties"
        p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"
        wanted_ids = self.no_urls['product_ids']
        urls_all = []
        for event, elem in iterparse(feed_path):
            if elem.tag != properties_tag:
                continue
            for r in elem:
                if r.tag == p + "Id" and r.text in wanted_ids:
                    index = wanted_ids.index(r.text)
                    self.no_urls['status'][index] = 'ran'
                    self.number += 1
                    xml_item, urls = self._item_from_properties(elem, p)
                    self.xml.create_xml(xml_item)
                    urls_all += urls
        for i, status in enumerate(self.no_urls['status']):
            if status != 'ran':
                self.no_urls['status'][i] = 'not_found'
        return urls_all

    def _item_from_properties(self, elem, p):
        """Build the item for one feed ``properties`` element.

        ``p`` is the namespace prefix of the child tags.  Returns the filled
        ``ChomeItem`` and the absolute urls of its images in feed order.
        """
        # a fresh item per product: values of a previous product must not
        # show up on a product whose own field is empty
        xml_item = ChomeItem()
        urls = []
        image_paths = []
        alternate_tags = [p + "Alternate%sImagePath_Large" % (str(i))
                          for i in range(1, 4)]
        for x in elem:
            if x.tag == p + "Id":
                xml_item['product_id'] = [x.text]
            elif x.tag == p + "EngLongDesc" and x.text is not None:
                xml_item['description_english'] = [self.escape(basic.cdata(x.text))]
            elif x.tag == p + "RetailPrice" and x.text is not None:
                # NOTE(review): the last two characters are dropped -
                # presumably surplus decimals; confirm against the feed
                xml_item['custom_price'] = [x.text[:-2]]
            elif x.tag == p + "SpnLongDesc" and x.text is not None:
                xml_item['description_spanish'] = [self.escape(basic.cdata(x.text))]
            elif x.tag == p + "PartNumber":
                xml_item['add_to_cart_id'] = [x.text]
            elif x.tag == p + "MaxQty":
                xml_item['max_qty'] = [x.text]
            elif x.tag == p + "TimeType":
                xml_item['time_type'] = [x.text]
            elif x.tag == p + "SpnName" and x.text is not None:
                xml_item['name_spanish'] = [x.text]
            elif x.tag == p + "EngName":
                xml_item['name_english'] = [x.text]
            elif x.tag == p + "ImagePath_Large" and x.text is not None:
                absolute = self.get_absolute(x.text)
                urls.append(absolute)
                # the main image always leads the list of image paths
                image_paths.insert(0, self.get_server_path(absolute))
            elif x.tag == p + "IsActive":
                # element text is a string, so the former comparison with
                # the integer 0 could never mark a product as not in stock.
                # NOTE(review): assumes inactive is spelled "0" or "false"
                # in the feed - confirm
                flag = (x.text or "").strip().lower()
                if flag in ("0", "false"):
                    xml_item['in_stock'] = ["NOT_IN_STOCK"]
                else:
                    xml_item['in_stock'] = ['IN_STOCK']
            elif x.tag in alternate_tags and x.text is not None:
                absolute = self.get_absolute(x.text)
                urls.append(absolute)
                image_paths.append(self.get_server_path(absolute))
        if image_paths:
            xml_item['normal_image_url'] = image_paths
        return xml_item, urls

    def get_server_path(self, url):
        """Return the image-store path (sha1 of the url + ".jpg") for one url."""
        path = self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"
        return path

    def get_absolute(self, url):
        """Return ``url`` prefixed with the celebratinghome.com host."""
        return "http://www.celebratinghome.com/" + url

    def escape(self, string):
        """Return ``string`` with html entities unescaped twice."""
        parser = HTMLParser.HTMLParser()
        # two passes, so doubly encoded entities are resolved as well
        return parser.unescape(parser.unescape(string))

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, updating
        the database and sending appropriate mail message."""
        msg = "Ran: {0}\n".format(datetime.now())
        missing = self.total - self.number
        if missing:
            msg += "{0} id(s) from id list weren't found in feed".format(missing)
            basic.warning(msg)
        else:
            msg += "All ids found in feed."
            basic.green(msg)
        # filename for writing xml
        if self.d['database']:
            # fallback so the xml, mail and log below still work when the
            # database fails before the catalog name is known
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.no_urls)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        # NOTE: the upload step (CommonExport().xml_to_db(self.name,
        # self.d['file'], "40b029c9-dff7-4bc1-b8bc-ef062960b24d"), guarded by
        # self.d['upload']) is disabled for this spider.
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "CelebratingHome: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "CelebratingHome: {0}".format(filename), self.d['email'])
        except Exception:
            # mailing is best effort; the failure is only noted in the log
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load the product lists from the excel file named in the arguments.

        On a successful read the lists are passed through the ``DictExcel``
        helpers that fill ``self.products`` and ``self.no_urls``; a read
        failure is reported through ``self.exc.code_handler`` with code 103.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            # post-process only lists that were read completely
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom text properties of this feed on ``xml``."""
        properties = (
            ("description_english", "Description English"),
            ("description_spanish", "Description Spanish"),
            ("add_to_cart_id", "Add To Cart ID"),
            ("max_qty", "Max Quantity"),
            ("time_type", "Time Type"),
            ("name_english", "Name English"),
            ("name_spanish", "Name Spanish"),
            ("in_stock", "In Stock"),
            ("custom_price", "Custom Price"),
        )
        for key, label in properties:
            xml.add_property(key, label, "text")

Exemple #10

0

Afficher le fichier

class ChomeSpider(CrawlSpider):
    """Spider that builds CelebratingHome product xml from the client feed.

    Product data is read from the client's xml feed (see
    ``parse_whole_xml``) rather than from crawled pages; only the ids held
    in ``self.no_urls['product_ids']`` are exported.
    """
    name = "chome"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com/"]
    counter = 0
    # Number of requested ids matched in the feed. Defined on the class so
    # spider_closed can read it even when parse_whole_xml never ran.
    number = 0
    # Namespace prefix of the per-product fields in the feed.
    FEED_NS = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"

    def __init__(self, *a, **kw):
        """Read run arguments and load the product id lists.

        The lists come from the database when ``self.d['database']`` is
        set, otherwise from the excel file.
        """
        super(ChomeSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            try:
                self.products, self.no_urls = self.database.select_products(
                    self.d['catalog_id'], self.d['product_id'])
            finally:
                # Release the connection even when the query fails.
                self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.no_urls['product_ids'])

    def parse(self, response):
        """Process the feed and return one item carrying all image urls."""
        self.counter += 1
        item = ChomeItem()
        print("IDs in excel feed: {0}".format(self.total))
        item['image_urls'] = self.parse_whole_xml()
        return item

    def parse_whole_xml(self):
        """Export every requested product found in the client feed.

        Downloads the feed first when ``self.d['download']`` is set;
        otherwise the previously downloaded file must exist. Each matched
        product is written through ``self.xml.create_xml`` and its status
        in ``self.no_urls`` becomes ``'ran'``; ids never matched end up as
        ``'not_found'``.

        Returns the list of absolute image urls of all matched products.
        """
        xml_dir = "xml/{0}".format(self.name)
        feed_path = "{0}/client_feed.xml".format(xml_dir)
        file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
        downloader = Downloader()
        if self.d['download']:
            downloader.get_file(xml_dir, file_url, "client_feed")
        elif not os.path.exists(feed_path):
            basic.warning(
                "Feed file doesn't exist please de-select no download option"
            )
            os._exit(2)
        self.number = 0
        urls_all = []
        wanted_ids = self.no_urls['product_ids']
        statuses = self.no_urls['status']
        properties_tag = "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties"
        id_tag = self.FEED_NS + "Id"
        for event, elem in iterparse(feed_path):
            if elem.tag != properties_tag:
                continue
            for r in elem:
                if r.tag == id_tag and r.text in wanted_ids:
                    statuses[wanted_ids.index(r.text)] = 'ran'
                    self.number += 1
                    xml_item, urls = self._item_from_properties(elem)
                    self.xml.create_xml(xml_item)
                    urls_all += urls
        for i, status in enumerate(statuses):
            if status != 'ran':
                statuses[i] = 'not_found'
        return urls_all

    def _item_from_properties(self, elem):
        """Build a fresh item from one feed ``properties`` element.

        A new ``ChomeItem`` is created per product so that optional fields
        missing for this product are not inherited from the previous one.

        Returns ``(item, urls)`` where ``urls`` are the absolute image urls
        of this product in feed order.
        """
        p = self.FEED_NS
        alternate_tags = [
            p + "Alternate%sImagePath_Large" % i for i in range(1, 4)
        ]
        xml_item = ChomeItem()
        urls = []
        image_paths = []
        for x in elem:
            if x.tag == p + "Id":
                xml_item['product_id'] = [x.text]
            elif x.tag == p + "EngLongDesc" and x.text is not None:
                xml_item['description_english'] = [
                    self.escape(basic.cdata(x.text))
                ]
            elif x.tag == p + "RetailPrice" and x.text is not None:
                # Drops the last two characters of the feed price
                # (presumably surplus decimals -- TODO confirm with feed).
                xml_item['custom_price'] = [x.text[:-2]]
            elif x.tag == p + "SpnLongDesc" and x.text is not None:
                xml_item['description_spanish'] = [
                    self.escape(basic.cdata(x.text))
                ]
            elif x.tag == p + "PartNumber":
                xml_item['add_to_cart_id'] = [x.text]
            elif x.tag == p + "MaxQty":
                xml_item['max_qty'] = [x.text]
            elif x.tag == p + "TimeType":
                xml_item['time_type'] = [x.text]
            elif x.tag == p + "SpnName" and x.text is not None:
                xml_item['name_spanish'] = [x.text]
            elif x.tag == p + "EngName":
                xml_item['name_english'] = [x.text]
            elif x.tag == p + "ImagePath_Large" and x.text is not None:
                absolute = self.get_absolute(x.text)
                urls.append(absolute)
                # The main image always leads the exported list.
                image_paths.insert(0, self.get_server_path(absolute))
            elif x.tag == p + "IsActive":
                # Element text is always a string, so compare against the
                # textual forms of "inactive" instead of the integer 0.
                # NOTE(review): assumes the feed writes 0/false for
                # inactive products -- confirm against a real feed.
                text = (x.text or "").strip().lower()
                if text in ("0", "false"):
                    xml_item['in_stock'] = ["NOT_IN_STOCK"]
                else:
                    xml_item['in_stock'] = ['IN_STOCK']
            elif x.tag in alternate_tags and x.text is not None:
                absolute = self.get_absolute(x.text)
                urls.append(absolute)
                image_paths.append(self.get_server_path(absolute))
        if image_paths:
            xml_item['normal_image_url'] = image_paths
        return xml_item, urls

    def get_server_path(self, url):
        """Return the image-store path for ``url``, named by its sha1 hash."""
        path = self.images_store + "/full/" + hashlib.sha1(
            url).hexdigest() + ".jpg"
        return path

    def get_absolute(self, url):
        """Prefix a feed-relative image path with the site address."""
        return "http://www.celebratinghome.com/" + url

    def escape(self, string):
        """Unescape html entities twice (the text is unescaped two levels deep)."""
        parser = HTMLParser.HTMLParser()
        return parser.unescape(parser.unescape(string))

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}\n".format(datetime.now())
        missing = self.total - self.number
        if missing:
            msg += "{0} id(s) from id list weren't found in feed".format(
                missing)
            basic.warning(msg)
        else:
            msg += "All ids found in feed."
            basic.green(msg)
        # filename for writing xml
        if self.d['database']:
            # Fallback name so the xml, mail and log below still work when
            # the database step fails before a name could be fetched.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.no_urls)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        # Export to database is disabled for this spider:
        #if self.d['upload']:
        #exp = CommonExport()
        #try:
        #exp.xml_to_db(self.name, self.d['file'], "40b029c9-dff7-4bc1-b8bc-ef062960b24d")
        #msg += "\n\nExport to database successful"
        #except StandardError:
        #msg += "\n\nExport to database failed"
        #else:
        #msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "CelebratingHome: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "CelebratingHome: {0}".format(filename),
                               self.d['email'])
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the excel file
        named by ``self.d['file']``; read errors go to the exception handler."""
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom text properties this spider exports."""
        xml.add_property("description_english", "Description English", "text")
        xml.add_property("description_spanish", "Description Spanish", "text")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("max_qty", "Max Quantity", "text")
        xml.add_property("time_type", "Time Type", "text")
        xml.add_property("name_english", "Name English", "text")
        xml.add_property("name_spanish", "Name Spanish", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("custom_price", "Custom Price", "text")

Exemple #11

0

Afficher le fichier

Fichier : burton_spider.py Projet : marjevtic/testMarko

class BurtonSpider(CrawlSpider):
    """Spider that scrapes Burton product pages into product xml.

    The pages to visit come from ``self.products['urls']``; products
    without a url are exported as not available.
    """
    name = "burton"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Read run arguments, load the product lists and set the start urls."""
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            try:
                self.products, self.no_urls = self.database.select_products(
                    self.d['catalog_id'], self.d['product_id'])
            finally:
                # Release the connection even when the query fails.
                self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        # The start urls are the product urls; a hard-coded single test url
        # used to overwrite them here and has been removed.
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page and write it to the xml.

        A redirected request is exported as ``NOT_AVAILABLE``. Returns the
        item on success and ``None`` when the page could not be processed.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        page = hxs.extract()
        redirected = 'redirect_urls' in response.request.meta
        if redirected:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        try:
            index = self.products['urls'].index(cur_url)
        except ValueError:
            # The url is not one of the requested products, so there is no
            # status slot to update; report it like any other failure.
            self.exc.code_handler(100, response.url)
            return None
        try:
            if redirected:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                # Set after create_xml so the download urls are not part of
                # the written xml.
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except Exception:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def handle_not_provided(self):
        """Export every product that has no url as ``NOT_AVAILABLE``."""
        item = BurtonItem()
        names = self.no_urls['names']
        # Positional pairing keeps the right name even for repeated ids.
        for index, product_id in enumerate(self.no_urls['product_ids']):
            item['product_id'] = [product_id]
            item['name'] = [names[index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return ``(product_id, name)`` as extracted lists."""
        name = hxs.select('//h1[@class="productHeading"]/text()').extract()
        product_id = hxs.select('//input[@name="productId"]/@value').extract()
        return product_id, name

    def get_server_path(self, url):
        """Return the image-store path for ``url``, named by its sha1 hash."""
        path = self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"
        return path

    def get_prices(self, hxs):
        """Return ``(price, old_price)``; ``old_price`` is empty when absent.

        Raises IndexError when the page has no current price.
        """
        price = hxs.select('//div[@class="op"]/text()').extract()
        price = [basic.get_price(price[0])]
        old_price = hxs.select('//span[@class="lp"]/text()').extract()
        if old_price:
            old_price = [basic.get_price(old_price[0])]
        return price, old_price

    def get_description(self, hxs):
        """Return ``(description, features)`` wrapped for the xml.

        The description is the fourth text node of the product info block;
        the features markup is cut to 2000 characters.
        """
        description = hxs.select('//div[@id="FieldsetProductInfo"]/text()').extract()[3]
        features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if features:
            features = [features[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(features)

    def get_variants(self, page):
        """Gets jsons for colors with all available sizes.
        In json are also fetched all information for sizes that are on the site

        Returns ``(sizes, image_urls, color_names)``.
        """
        script = basic.get_middle_text(page, 'var skuSizeColorObj = new Array();', '</script>')[0]
        sizes = []
        image_urls = []
        color_names = []
        # The first piece is whatever precedes the first assignment.
        for chunk in script.split('skuSizeColorObj')[1:]:
            temp = basic.get_middle_text(chunk, '= ', ';')
            t = simplejson.loads(burton.replace_for_json(temp[0]))
            image_urls.append(t['swatchURL'])
            color_names.append(t['ColorDesc'])
            # The exported json points at the stored copy of the swatch.
            t['swatchURL'] = self.get_server_path(t['swatchURL'])
            sizes.append(basic.cdata(simplejson.dumps(t)))
        return sizes, image_urls, color_names

    def get_all_sizes(self, page):
        """Return all sizes listed in the page script as one cdata json."""
        script = basic.get_middle_text(page, 'var distsizeobj=new Array();', 'var indexcolor=0;')[0]
        all_sizes = basic.get_middle_text(script, ']="', '";')
        return [basic.cdata(simplejson.dumps(all_sizes))]

    def get_colors(self, page, color_names):
        """Gets color information with images from javascript on the page.
        Returns json with color name and image url for that color, and
        returns list of image urls that can be used for download later."""
        script = basic.get_middle_text(page, 'var imageMap_0 = new Array();', '</script>')[0]
        colors = basic.get_middle_text(script, '] = ', ';')
        image_urls = []
        colors_json = []
        for i, color_name in enumerate(color_names):
            color = simplejson.loads(burton.replace_color_json(colors[i]))
            color['cname'] = color_name
            color.pop('reg')
            image_urls.append(color['enh'])
            # The exported json points at the stored copy of the image.
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            # Fallback name so the xml, mail and log below still work when
            # the database step fails before a name could be fetched.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename), self.d['email'])
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the excel file
        named by ``self.d['file']``; read errors go to the exception handler."""
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

Exemple #12

0

Afficher le fichier

class BootsSpider(CrawlSpider):
    name = "boots"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Read run arguments, load the product lists and set the start urls.

        The lists come from the database when ``self.d['database']`` is
        set, otherwise from the excel file. Products without a url are
        exported straight away through ``handle_not_provided``.
        """
        super(BootsSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            try:
                self.products, self.no_urls = self.database.select_products(
                    self.d['catalog_id'], self.d['product_id'])
            finally:
                # Release the connection even when the query fails.
                self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Scrape one product page, write it to the xml and return the item.

        Products with color or size variants are written as a master item
        followed by one item per variant; products without variants carry
        their own price fields. The returned item additionally holds
        ``image_urls`` with every image url collected for the page.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BootsItem()
        item['product_id'], item['store_id'], item['lang_id'], item['catalog_id'] = self.get_ids(hxs)
        item['name'] = self.get_name(hxs)
        item['short_description'], sponsored, description, in_stock, item['ingredients'], patient_information_url, item['offer'], item['promotion'] = self.get_description(hxs)
        item['rating'] = self.get_rating(hxs)
        size, price_per_size = self.get_size(hxs)
        item['normal_image_url'], image_urls = self.get_images(hxs)
        brand, brand_image_url = self.get_brand(hxs)
        item['save_money'], item['old_price'] = self.get_oldies(hxs)
        # Each description piece goes into its own numbered field
        # (description_1, description_2, ...).
        for i in range(0, len(description)):
            tag = 'description_%d' % (i + 1)
            item[tag] = [basic.cdata(description[i])]
        # NOTE(review): get_description returns sponsored wrapped in a
        # one-element list, so this check is always true -- confirm intent.
        if sponsored is not None:
            item['sponsored'] = sponsored
        # Not in stock unless the page shows the exact text "In stock".
        item['in_stock'] = ["NOT_IN_STOCK"]
        if in_stock == "In stock":
            item['in_stock'] = ["IN_STOCK"]
            item['order_id'] = hxs.select('//input[@name="orderId"]/@value').extract()
            item['cat_entry_id'] = hxs.select('//input[@name="catEntryId"]/@value').extract()
            item['calculation_usage_id'] = hxs.select('//input[@name="calculationUsageId"]/@value').extract()
        if brand_image_url is not None:
            item['brand'] = brand
            item['brand_image_url'] = ["43662980-f344-11e1-a21f-0800200c9a66/full/" + self.get_image_sha1(brand_image_url)]
            image_urls.append(brand_image_url)
        if patient_information_url is not None:
            item['patient_information_url'] = [basic.cdata(patient_information_url)]
        # Color variants are tried first; every returned value is None when
        # the page has none.
        prices, point_prices, collect_points, colors, color_image_urls, variant_ids = self.get_color_variants(hxs)
        if size is not None:
            item['size'] = size
            item['price_per_size'] = price_per_size
        elif variant_ids is None:
            # No fixed size and no color variants: look for size variants.
            prices, point_prices, collect_points, sizes, variant_ids = self.get_size_variants(hxs)
        if color_image_urls is not None:
            image_urls.extend(color_image_urls)
        if variant_ids is not None:
            # Master item first, then one item per variant.
            self.xml.create_xml(item)
            if colors is not None:
                self.create_color_variants(prices, point_prices, colors, color_image_urls, variant_ids, collect_points, item['product_id'])
            else:
                # sizes is only bound when get_size_variants ran above.
                self.create_size_variants(prices, point_prices, sizes, variant_ids, collect_points, item['product_id'])
        else:
            # No variants: the price fields live on the product itself.
            prices = hxs.select('//p[@class="price"]/text()').extract()[0]
            point_prices = hxs.select('//span[@class="pointsPrice"]/text()').extract()[0]
            collect_points = [basic.get_price(hxs.select('//p[@class="collectPoints"]/text()').extract()[0])]
            item['price'] = [basic.get_price(prices)]
            item['points_price'] = [basic.get_price(point_prices)]
            item['collect_points'] = collect_points
            self.xml.create_xml(item)
        # Set after create_xml, so the download urls are not in the xml.
        item['image_urls'] = image_urls
        #raw_input("Press Enter to continue...")
        return item

    def handle_not_provided(self):
        """Export every product that has no url as ``NOT_AVAILABLE``.

        Ids and names are paired by position, so a repeated id keeps its
        own name and no list search is needed per product.
        """
        item = BootsItem()
        names = self.no_urls['names']
        for index, product_id in enumerate(self.no_urls['product_ids']):
            item['product_id'] = [product_id]
            item['name'] = [names[index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_ids(self, hxs):
        """Return the product, store, language and catalog ids of the page.

        Each id is the first value of the hidden input with that name and
        is returned as a one-element list. Raises IndexError when one of
        the inputs is missing.
        """
        input_names = ("productId", "storeId", "langId", "catalogId")
        # Read all four values first, then wrap them.
        values = [
            hxs.select('//input[@name="%s"]/@value' % input_name).extract()[0]
            for input_name in input_names
        ]
        return tuple([value] for value in values)
        
    def get_name(self, hxs):
        """Return the product name as a one-element list.

        Raises IndexError when the page has no product name span.
        """
        matches = hxs.select('//span[@class="pd_productNameSpan"]/text()').extract()
        return [matches[0]]
    
    def get_description(self, hxs):
        """Collect the descriptive fields of a product page.

        Returns an 8-tuple, in this order: short description (one-element
        list), sponsored message (one-element list), detailed description
        split into pieces of at most 2000 characters (list of strings),
        stock text (string, "" when absent), ingredients (one-element
        list), patient information url (string or None), main offer
        (one-element list) and promotions (list). Optional values that are
        not on the page come back as None.

        Raises IndexError when the page has no short description.
        """
        short_description = hxs.select('//div[@class="productIntroCopy"]').extract()[0]
        try:
            suitable_for = ''.join(hxs.select('//div[@id="suitableFor"]//h4 | //div[@id="suitableFor"]//p | //div[@id="suitableFor"]//div').extract())
            short_description += suitable_for
        except Exception:
            print("There's no suitable_for section")
        try:
            ingredients = basic.clean_string(' '.join(hxs.select('//div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]/p | //div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]//div').extract()))
            if ingredients != '':
                ingredients = basic.cdata(ingredients)
        except Exception:
            print("No ingredients found!")
            ingredients = None
        patient_information_url = self._first_or_none(
            hxs, '//div[@class="downloadMedia"]//a/@href',
            "No patient information found!")
        offer = self._first_or_none(
            hxs, '//div[@id="mainOffer"]//a/text()',
            "No special offer found!")
        try:
            promotion = hxs.select('//div[@id="otherOffers"]//a/text()').extract()
        except Exception:
            print("No promotion found!")
            promotion = None
        sponsored = self._first_or_none(
            hxs, '//div[@class="sponsored"]//p/text()',
            "No sponsor message found!")
        description = ''.join(hxs.select('//div[@id="detailedInfo"]//div[@class="pd_panelInner"]//div[@class="pd_HTML"]').extract())
        description = basic.clean_string(description)
        # Split into consecutive 2000-character pieces without losing the
        # character at each boundary; an empty description stays as one
        # empty piece.
        limit = 2000
        desc = [description[start:start + limit]
                for start in range(0, len(description), limit)]
        if not desc:
            desc = [description]
        try:
            in_stock = hxs.select('//div[@class="icon_pl_stock"]/text()').extract()[0]
        except Exception:
            in_stock = ""
        return [basic.cdata(basic.clean_string(short_description))], [sponsored], desc, in_stock, [ingredients], patient_information_url, [offer], promotion

    def _first_or_none(self, hxs, xpath, missing_message):
        """Return the first match of ``xpath``, or print the message and return None."""
        try:
            return hxs.select(xpath).extract()[0]
        except Exception:
            print(missing_message)
            return None
    
    def get_images(self, hxs):
        """Return ``(normal_image_url, image_urls)`` for the main image.

        The first element holds the stored path (fixed folder plus the sha1
        of the original url), the second the original url for download.
        Raises IndexError when the page has no ``og:image`` meta tag.
        """
        original_url = hxs.select('//meta[@property="og:image"]//@content').extract()[0]
        stored_path = "43662980-f344-11e1-a21f-0800200c9a66/full/" + self.get_image_sha1(original_url)
        return [stored_path], [original_url]
    
    def get_brand(self, hxs):
        """Return ``([brand], brand_image_url)``, or ``(None, None)`` when
        the page shows no brand image."""
        brand_image = '//div[@class="pd_brand"]//div//a//span//img'
        try:
            brand = hxs.select(brand_image + '/@alt').extract()[0]
            brand_image_url = hxs.select(brand_image + '/@src').extract()[0]
        except Exception:
            # Best effort: a page without brand is normal, but interpreter
            # exits and interrupts are no longer swallowed here.
            print("No brand name or image found!")
            return None, None
        return [brand], brand_image_url
    
    def get_rating(self, hxs):
        """Return the average rating as a one-element list, "0.0" when the
        page shows none."""
        try:
            rating = hxs.select('//span[@property="v:average"]/text()').extract()[0]
        except Exception:
            # Missing rating is expected; interpreter exits and interrupts
            # are no longer swallowed here.
            rating = "0.0"
        return [rating]
    
    def get_size(self, hxs):
        """Return ``([size], [price_per_size])``, or ``(None, None)`` when
        the page shows no size."""
        try:
            size = hxs.select('//span[@class="size"]/text()').extract()[0]
            size = basic.clean_string(size)
            size = size.replace("|", "")
            price_per_size = hxs.select('//span[@class="pricePerSize"]/text()').extract()[0]
        except Exception:
            # Best effort: a page without size is normal, but interpreter
            # exits and interrupts are no longer swallowed here.
            print("No size found")
            return None, None
        return [size], [price_per_size]
        
    def get_oldies(self, hxs):
        """Return ``([save], [old])``: the saving and the old price.

        Both values are None (still wrapped in lists) when either of them
        is missing from the page or cannot be converted.
        """
        try:
            save = hxs.select('//span[@class="save"]/text()').extract()[0]
            old = hxs.select('//span[@class="oldPrice"]/text()').extract()[0]
            save = basic.get_price(save)
            old = basic.get_price(old)
        except Exception:
            # Best effort: most products have no old price, but interpreter
            # exits and interrupts are no longer swallowed here.
            save = None
            old = None
        return [save], [old]
            
    def get_color_variants(self, hxs):
        """Read the color variants from the page script.

        Returns ``(prices, point_prices, collect_points, colors,
        color_image_urls, variant_ids)``. All six values are None when the
        script cannot be found or parsed.
        """
        try:
            # Raw string: same pattern as before, without relying on Python
            # leaving the unknown escape \d untouched.
            variants = hxs.select('//script').re(r'productCode:".*\d"')[0].split(",")
            colors = hxs.select('//div[@class="gp_80-20a column"]//div[@class="innerColumn"]//fieldset//div//label//span/text()').extract()
            color_image_urls = hxs.select('//div[@class="gp_80-20a column"]//div[@class="innerColumn"]//fieldset//div//label//img//@src').extract()
            collect_points = []
            prices = []
            point_prices = []
            variant_ids = []
            # Each variant takes eight comma-separated fields: the id is at
            # offset 0, the price at 2 and the collect points at 5.
            for i in range(0, len(variants), 8):
                price = basic.get_price(variants[i+2])
                prices.append(price)
                # The points price is the price in hundredths.
                point_prices.append(str(int(float(price) * 100)))
                variant_ids.append(basic.get_price(variants[i]))
                collect_points.append(basic.get_price(variants[i+5]))
            return prices, point_prices, collect_points, colors, color_image_urls, variant_ids
        except Exception:
            # Best effort: pages without color variants are normal, but
            # interpreter exits and interrupts are no longer swallowed.
            print("No color variants found")
            return None, None, None, None, None, None
            
    def get_size_variants(self, hxs):
        """Read the size variants from the page script.

        Returns ``(prices, point_prices, collect_points, sizes,
        variant_ids)``. All five values are None when the script is not on
        the page.
        """
        try:
            # Raw string: same pattern as before, without relying on Python
            # leaving the unknown escape \d untouched.
            variants = hxs.select('//script').re(r'productCode:".*\d"')[0].split(",")
        except Exception:
            # Best effort: pages without size variants are normal, but
            # interpreter exits and interrupts are no longer swallowed.
            print("No size variants found")
            return None, None, None, None, None
        # The first option of the size select is skipped.
        sizes = hxs.select('//select[@id="size_x"]//option/text()').extract()[1:]
        collect_points = []
        prices = []
        point_prices = []
        variant_ids = []
        # Variants start at field 7 and take seven fields each: collect
        # points at offset 1, the price at 2 and the id at 4.
        for i in range(7, len(variants), 7):
            price = basic.get_price(variants[i+2])
            prices.append(price)
            # The points price is the price in hundredths.
            point_prices.append(str(int(float(price) * 100)))
            variant_ids.append(basic.get_price(variants[i+4]))
            collect_points.append(basic.get_price(variants[i+1]))
        return prices, point_prices, collect_points, sizes, variant_ids
    
    def create_color_variants(self, prices, point_prices, colors, color_image_urls, variant_ids, collect_points, product_id):
        """Write one variant item per color, linked to ``product_id``.

        Every variant receives the first entry of ``collect_points``; the
        other lists are read at the position of the color.
        """
        image_folder = "43662980-f344-11e1-a21f-0800200c9a66/full/"
        for position, color in enumerate(colors):
            variant = BootsItem()
            variant['master_product_id'] = product_id
            variant['product_id'] = [variant_ids[position]]
            variant['price'] = [prices[position]]
            variant['points_price'] = [point_prices[position]]
            variant['collect_points'] = [collect_points[0]]
            variant['color'] = [color]
            variant['color_image_url'] = [image_folder + self.get_image_sha1(color_image_urls[position])]
            self.xml.create_xml(variant)
            
    def create_size_variants(self, prices, point_prices, sizes, variant_ids, collect_points, product_id):
        """Write one variant item per size, linked to ``product_id``.

        Every variant receives the first entry of ``collect_points``; the
        other lists are read at the position of the size.
        """
        for position, size in enumerate(sizes):
            variant = BootsItem()
            variant['master_product_id'] = product_id
            variant['product_id'] = [variant_ids[position]]
            variant['price'] = [prices[position]]
            variant['points_price'] = [point_prices[position]]
            variant['collect_points'] = [collect_points[0]]
            variant['size'] = [size]
            self.xml.create_xml(variant)
    
    def get_image_sha1(self, image_url):
        """Return the hexadecimal SHA-1 digest of ``image_url``.

        Byte strings are hashed as they are.  Text (unicode) values are
        encoded as UTF-8 first, so a URL containing non-ASCII characters is
        hashed instead of raising an encoding error.  For pure-ASCII text the
        digest is the same as for the equivalent byte string.
        """
        if not isinstance(image_url, bytes):
            image_url = image_url.encode("utf-8")
        return hashlib.sha1(image_url).hexdigest()
    
    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Builds the run summary, updates the database when the run was started
        from the interface, writes the XML file and, when requested, exports
        it to the database.  Sending mail and writing the log file are
        currently disabled (the string block at the end of the method).

        ``spider`` is the argument delivered with the signal; it is not used.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # Fallback filename for writing xml: only used when the catalog name
        # could not be read from the database.  Without it a database failure
        # left ``filename`` unbound and write_xml() below raised
        # UnboundLocalError instead of writing the scraped data.
        filename = self.name
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "5097450b-2c49-49d4-b47a-55b1bc652c78")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        # The block below is a bare string expression, i.e. disabled code:
        # nothing in it is executed.
        """try:
            mail.send_mail(msg, "Boots: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Boots: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)"""

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the excel file.

        The file is the one named by ``self.d['file']``.  Read failures are
        reported through ``self.exc.code_handler`` with code 103; afterwards
        the lists are de-duplicated, split into products with and without
        urls, and both parts are passed through ``_add_none_status``.
        """
        file_note = "\nError occurred for given file: {0}"
        workbook = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = workbook.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = workbook.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = workbook.read_excel_collumn(2, 15)
        except IOError as err:
            detail = "I/O error {0}: {1}".format(err.errno, err.strerror)
            self.exc.code_handler(103, msg=detail + file_note.format(self.d['file']))
        except StandardError:
            detail = "Error reading excel file"
            self.exc.code_handler(103, msg=detail + file_note.format(self.d['file']))
        self.products = workbook.delete_duplicates_dict(self.products)
        self.products, self.no_urls = workbook.separate_no_urls(self.products)
        self.products = workbook._add_none_status(self.products)
        self.no_urls = workbook._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom product properties on ``xml``.

        Every row below is passed to ``xml.add_property`` as three positional
        arguments, in the listed order.
        """
        properties = (
            ("in_stock", "In Stock", "text"),
            ("store_id", "Store ID", "text"),
            ("lang_id", "Lang ID", "text"),
            ("catalog_id", "Catalog ID", "text"),
            ("order_id", "Order ID", "text"),
            ("cat_entry_id", "Cat Entry ID", "text"),
            ("calculation_usage_id", "Calculation Usage ID", "text"),
            ("ingredients", "Ingredients", "text"),
            ("patient_information_url", "Patient Information Url", "text"),
            ("points_price", "Points Price", "integer"),
            ("collect_points", "Collect Points", "integer"),
            ("brand_image_url", "Brand Image Url", "text"),
            ("description_1", "Description 1", "text"),
            ("description_2", "Description 2", "text"),
            ("description_3", "Description 3", "text"),
            ("description_4", "Description 4", "text"),
            ("description_5", "Description 5", "text"),
            ("description_6", "Description 6", "text"),
            ("sponsored", "Sponsored", "text"),
            ("offer", "Offer", "text"),
            ("promotion", "Promotion", "text"),
            ("old_price", "Old Price", "decimal"),
            ("save_money", "Save Money", "decimal"),
            ("price_per_size", "Price Per Size", "text"),
        )
        for entry in properties:
            xml.add_property(*entry)

Exemple #13

0

Afficher le fichier

Fichier : kenneth_spider.py Projet : marjevtic/testMarko

class KennethSpider(CrawlSpider):
    name = "kenneth"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Set up the spider: arguments, XML writer and the product lists.

        The product lists come from the database when ``self.d['database']``
        is set, otherwise from the excel file.  Products without a url are
        written to the XML immediately; the remaining urls become
        ``start_urls``.
        """
        super(KennethSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.images_store = "".join(("/", settings['IMAGES_STORE'], "/"))
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        print(self.d)
        if not self.d['database']:
            self.get_lists_from_excel()
        else:
            self.database = Database()
            self.database.connect()
            selected = self.database.select_products(self.d['catalog_id'],
                                                     self.d['product_id'])
            self.products, self.no_urls = selected
            self.database.disconnect()
        self.add_properties(self.xml)
        self.no_url_products(self.no_urls)
        self.start_urls = self.products['urls']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page and write it to the XML output.

        Available products get their name, description, prices, colour
        swatches, recommendations and colour/size sub-products written;
        pages showing the "noResultsContent" div are written as
        NOT_AVAILABLE and reported with code 102.  Any failure is reported
        with code 100 and the product status is set to "error".

        Returns the (possibly partially filled) item.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = KennethItem()
        # Bound before the try block so the error handler can always use it.
        cur_url = response.url
        # main try for script, run general except if error happens in code
        # (send url on mail where it happened)
        try:
            # search for noResultContent div on the page, if it exists keep
            # track, that product doesn't exist on
            # their page, otherwise continue scraping page
            available = hxs.select('//div[@id="noResultsContent"]').extract()

            if not available:
                index = self.products['urls'].index(cur_url)
                cur_id = self.get_product_id(cur_url)
                id = self.products['product_ids'][index]
                page = hxs.select('//div[@id="mainContent"]').extract()
                page = " ".join(page)
                item['name'], item['description'] = self.get_basic_info(hxs)
                price, new_p, old_p = self.get_prices(hxs)
                if new_p:
                    item['new_price'] = new_p
                    item['old_price'] = old_p
                else:
                    item['price'] = price
                desc = basic.clean_string(item['description'][0])
                item['description'] = [desc]
                urls = self.get_color_image(hxs)
                new = self.get_image_server_path(urls, id)
                item['color_image_urls'] = new
                self.export(item['color_image_urls'], [id], "swatchImage")
                jsons, images = self.we_also_recommend(cur_id, id)
                item['product_page'] = [cur_url]
                item['product_id'] = [id]
                item['add_to_cart_id'] = [cur_id]
                item['recommended_product'] = jsons
                item['in_stock'] = ["IN_STOCK"]
                self.products['status'][index] = "ran"
                # get_colors() writes the colour and size sub-products itself
                # and returns 404 when the page carries no colour data
                images_or_404 = self.get_colors(hxs, page, id)
                if images_or_404 == 404:
                    item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                # image_urls is filled only after the xml record is written
                item['image_urls'] = []
                if images_or_404 != 404:
                    item['image_urls'] += images_or_404
                item['image_urls'] += urls
                item['image_urls'] += images
                #self.export(item['image_urls'])
                #item['image_urls'] = [] #uncomment for downloading images

            else:
                # part for handling products that are not available
                cur_id = self.get_product_id(cur_url)
                cur_url = "http://www.kennethcole.com/product/index.jsp?"
                cur_url += "productId=" + str(cur_id)
                index = self.products['urls'].index(cur_url)
                self.products['status'][index] = "no_avail"
                item['product_id'] = [self.products['product_ids'][index]]
                if self.products['product_ids'][index]:
                    item['name'] = [self.products['names'][index]]
                else:
                    item['name'] = ["not available"]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                self.exc.code_handler(102, cur_url)
        except:
            # part for catching errors and keeping track of numbers of
            # it and urls where it happened
            # NOTE(review): the bare except is kept as it was; narrowing it
            # could change which failures get recorded here.
            print("Error occured scraping this product")
            # The url may be missing from the product list (that lookup is one
            # of the things that can fail above), so it must not be allowed to
            # raise out of the handler and hide the original problem.
            try:
                index = self.products['urls'].index(cur_url)
            except ValueError:
                index = None
            if index is not None:
                self.products['status'][index] = "error"
            self.exc.code_handler(100, cur_url)
        return item

    def no_url_products(self, no_url):
        """Write a NOT_AVAILABLE record for every product that has no url.

        ``no_url`` holds the parallel lists ``'product_ids'`` and ``'names'``.
        Each id is paired with the name at the same position; looking the
        position up again with ``list.index`` was quadratic and returned the
        first match, i.e. the wrong name for a repeated id.
        """
        item = KennethItem()
        for index, product_id in enumerate(no_url['product_ids']):
            item['product_id'] = [product_id]
            item['name'] = [no_url['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    #function for getting basic product info from the page
    def get_basic_info(self, hxs):
        """Return ``(name, [description])`` read from the product page.

        ``name`` is the list of extracted h1 texts; the description is the
        first "productDescription" div passed through ``basic.cdata``.
        """
        name = hxs.select('//div[@id="productInfoTop"]/h1/text()').extract()
        description_nodes = hxs.select('//div[@id="productDescription"]').extract()
        description = basic.cdata(description_nodes[0])
        return name, [description]

    # function for getting prices from the page: only one price, or both the
    # new and the old one if that's the case
    def get_prices(self, hxs):
        """Return ``([price], new_price, old_price)`` read from the page.

        ``price`` is the first h2 text of the product header with everything
        except digits, dots and commas removed; the other two values are the
        unmodified extracted lists.
        """
        headline = hxs.select('//div[@id="productInfoTop"]/h2/text()').extract()[0]
        sale_price = hxs.select('//h2[@class="sale-now"]/text()').extract()
        former_price = hxs.select('//span[@class="productGrey"]/text()').extract()
        return [re.sub('[^0-9.,]', '', headline)], sale_price, former_price

    def get_color_image(self, hxs):
        """Return the extracted ``src`` values of the "productInfoR2W" images."""
        swatch_sources = hxs.select('//div[@id="productInfoR2W"]/img/@src')
        return swatch_sources.extract()

    # function for getting colors from javascript on the page and writing them
    # to xml; the function for creating the per-size subproducts is called
    # from here
    def get_colors(self, hxs, page, main_id):
        """Write one colour sub-product per javascript display block.

        ``page`` is the page markup as one string.  The script following the
        first 'displays[0]' marker is split on "};" and every part except the
        last is treated as one colour.  For each colour an item is written to
        the XML and its images are handed to ``self.export``; finally
        ``self.get_sizes`` is called for the size sub-products.

        Returns the list of all collected image urls, or the integer 404
        when the page contains no 'displays[0]' marker.
        """
        item = KennethItem()
        try:
            tmp = page.split('displays[0]')[1]
        except IndexError:
            print "This product is not available"
            return 404
        script = tmp.split('</script>')[0]
        displays = script.split("};")
        # declared but not read or assigned anywhere in this method
        global counter
        ids = []
        images = []
        # filled below but not read anywhere in this method
        color_ids = []
        sizes_script = self.get_sizes_part_page(page)
        # maps the page's colour id to the position used in our product ids
        color_internal_code = {}

        for x in range(0, len(displays) - 1):
            id = basic.get_middle_text(displays[x], 'colorId: "', '"')
            ids.append(id[0])
            # number of "Reg" occurrences bounds the vw<N>Reg image entries
            reg = displays[x].count("Reg")
            images_in = []
            for i in range(1, reg + 1):
                image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg: "', '"')
                if len(image) == 0:
                    # same key written without the space after the colon
                    image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg:"', '"')
                if (len(image) > 0):
                    if (image[0] != "null"):
                        images_in.append(image[0])

            # no usable image in the script: fall back to the productImage input
            if not images_in:
                images_in = hxs.select('//input[@name="productImage"]/@value').extract()
            color_ids.append(str(main_id) + "_" + str(x))
            item['product_id'] = [str(main_id) + "_" + str(x)]
            item['color_option_id'] = id
            item['master_product_id'] = [main_id]
            item['normal_image_url'] = self.get_image_server_path(images_in, main_id)
            item['thumb_image_url'] = self.get_image_server_path_thumb(images_in, main_id)
            item['in_stock'] = ["NOT_IN_STOCK"]
            item['color'] = self.get_color_name(sizes_script, id[0])
            color_internal_code[id[0]] = str(x)
            self.xml.create_xml(item)
            images += images_in
            self.export(item['normal_image_url'], item['product_id'], "productImage")
        self.get_sizes(sizes_script, ids, main_id, color_internal_code)
        return images

    # function for getting sizes for products from javascript, and storing 
    # information in dicts of format {id : information}
    def get_sizes(self, page, ids, main_id, color_internal_code):
        """Collect the size options per colour and write them as sub-products.

        ``page`` is the sizes javascript; it is split on "};" and every part
        except the last is matched against the colour ids in ``ids``.  The
        sku, size, price and availability of each match are grouped per
        colour id and handed to ``create_subproducts_xml``.

        Returns ``(main_id, colors_name, sizes, skus, inStocks, prices)``;
        ``colors_name`` is always an empty dict.
        """
        skus, sizes, prices, inStocks = {}, {}, {}, {}
        colors_name = {}
        for option in page.split("};")[:-1]:
            option_color = basic.get_middle_text(option, 'cId: "', '"')
            for color_id in ids:
                if option_color[0] == color_id:
                    raw_sku = basic.get_middle_text(option, 'sku: ', ',s')
                    digits = re.sub("[^0-9]", "", raw_sku[0])
                    skus = self.add_to_dict(skus, color_id, digits)
                    size_text = basic.get_middle_text(option, 'sDesc: "', '"')
                    sizes = self.add_to_dict(sizes, color_id, size_text[0])
                    raw_price = basic.get_middle_text(option, 'price: "', '"')
                    cleaned = self.clean_price(raw_price[0])
                    prices = self.add_to_dict(prices, color_id, cleaned[0])
                    stock = basic.get_middle_text(option, 'avail: "', '"')
                    inStocks = self.add_to_dict(inStocks, color_id, stock[0])
        self.create_subproducts_xml(main_id, color_internal_code, colors_name,
                                    sizes, skus, inStocks, prices)
        return main_id, colors_name, sizes, skus, inStocks, prices

    # function for creating subproducts for every size
    def create_subproducts_xml(self, main_id, color_internal_code, colors_name, sizes, skus, inStocks, prices):
        """Write one XML record for every size of every colour.

        ``sizes``, ``skus``, ``inStocks`` and ``prices`` map a colour id to
        parallel lists.  The master id of a size record is
        ``main_id + "_" + color_internal_code[colour id]`` and its own id adds
        the position of the size.  ``colors_name`` is accepted but not used.
        """
        for color_id, color_sizes in sizes.iteritems():
            item = KennethItem()
            for pos, size in enumerate(color_sizes):
                item['size'] = [size]
                item['size_option_id'] = [skus[color_id][pos]]
                parent_id = main_id + "_" + color_internal_code[color_id]
                item['master_product_id'] = [parent_id]
                item['product_id'] = [parent_id + "_" + str(pos)]
                # two availability codes are translated, anything else is
                # stored unchanged
                availability = inStocks[color_id][pos]
                if availability == "NOT_AVAILABLE":
                    availability = "NOT_IN_STOCK"
                elif availability == "ADVANCED_SALE_LIMITED":
                    availability = "IN_STOCK"
                item['in_stock'] = [availability]
                item['price'] = [prices[color_id][pos]]
                self.xml.create_xml(item)

    def add_to_dict(self, dict, index, value):
        """Append ``value`` to the list stored under ``index``.

        A missing key is created with a one-element list.  Only the missing
        key is handled; any other error (for instance an existing value that
        is not a list) is raised instead of silently replacing the stored
        value.

        Returns the same mapping that was passed in.
        """
        try:
            dict[index].append(value)
        except KeyError:
            dict[index] = [value]
        return dict

    # function for getting we also recommend information about products from
    # their page, returns json list with information and images
    # list with images urls
    def we_also_recommend(self, id, main_id):
        """Fetch the "we also recommend" products for product ``id``.

        Downloads the recommendation response from res-x.com and reads one
        product from every part following a "certonaRecBoxes" marker.

        Returns ``(jsons, images)``: the output of ``make_json`` for the
        recommended products (with image paths built for ``main_id``) and the
        list of the original image urls.
        """
        request_url = "".join((
            "http://www.res-x.com/ws/r2/Resonance.aspx?appid=kennethcole01&t",
            "k=154212870918247&ss=525178103419747&sg=1&pg=897706724574618&b",
            "x=true&vr=2.67&sc=product_rr&ev=product&ei=", id, "&cu=&ct=k",
            "ennethcolec01&no=3&cb=r1eh&clk=&cv1=", id, "&cv23=63&ur=http%",
            "3A//www.kennethcole.com/product/index.jsp%3FproductId%3D3", id,
            "&plk=&rf=",
        ))
        import urllib2
        body = urllib2.urlopen(request_url).read()
        ids, names, prices, images, urls = [], [], [], [], []
        # parsing data got from the upper url about we also recommend products;
        # the part before the first marker is not a product
        for box in body.split("certonaRecBoxes")[1:]:
            rec_id = [basic.get_middle_text(box, "d=", '\\"')[0]]
            image = basic.get_middle_text(box, 'src=\\"', '\\"')[0]
            name = basic.get_middle_text(box, 'alt=\\"', '\\"')
            price = basic.get_middle_text(box, '<br>', '</a>')
            urls.append("http://www.kennethcole.com/product/index.jsp?productId=" + rec_id[0])
            ids.append(rec_id)
            names.append(name)
            prices.append(price)
            images.append(image)
        server_images = self.get_image_server_path(images, main_id)
        jsons = self.make_json(ids, names, prices, server_images, urls)
        return jsons, images

    # function for getting product id from the url
    def get_product_id(self, url):
        """Return the text between the first and the second "=" of ``url``.

        When the url has only one "=", everything after it is returned.
        """
        pieces = url.split("=")
        return pieces[1]

    #function for making json
    def make_json(self, ids, names, prices, images, urls):
        """Build one JSON-like string per recommended product.

        The lists are read in parallel by position; ``ids``, ``names`` and
        ``prices`` hold sequences whose first element is used.  Values are
        inserted as they are (no escaping) and every string is passed through
        ``basic.cdata`` before it is returned.
        """
        jsons = []
        for pos, id_entry in enumerate(ids):
            pieces = (
                '{ "id" : "', str(id_entry[0]), '", ',
                '"name" : "', str(names[pos][0]), '", ',
                # insert function for storing the right image path
                '"image_url" : "', str(images[pos]), '", ',
                '"product_url" : "', urls[pos], '", ',
                '"price" : "', str(prices[pos][0]), '" } ',
            )
            jsons.append(basic.cdata("".join(pieces)))
        return jsons

    #function for getting javascript where sizes are handled
    def get_sizes_part_page(self, page):
        """Return the script text that follows "availDates = new Array();".

        The text runs from the first occurrence of that marker up to the next
        "</script>".  An IndexError is raised when the marker is missing.
        """
        after_marker = page.split("availDates = new Array();")[1]
        return after_marker.split("</script>")[0]

    # function for getting name of the color by id
    def get_color_name(self, script, id):
        """Return ``[name]`` for the colour ``id`` found in ``script``.

        The name is the text between the last 'cDesc: "' that precedes the
        first occurrence of ``id`` and the following double quote.  (An
        unreachable second ``return`` statement was removed.)
        """
        before_id = script.split(id)[0]
        after_label = before_id.split('cDesc: "')[-1]
        name = after_label.split('"')[0]
        return [name]

    #function for exporting images to database via rest
    def export(self, images, id, tags):
        """Upload product images to the Zmags import endpoint (disabled).

        With ``override`` hard-coded to 1 the method does nothing.  When it is
        set to 0, every entry of ``images`` is read from
        "images/kenneth_images/small/" and posted together with ``id[0]``, its
        1-based position and ``tags``.
        """
        #set override to 0 for uploading images or else to skip uploading
        override = 1
        if override == 0:
            import MultipartPostHandler
            import urllib2
            import os
            url = 'http://api.admin.zmags.com/productImage/import?key=5ef90922-283b-4412-a1c8-3e70bc28b9d3'

            for i in range(0, len(images)):
                image_name = self.get_image_name(images[i])
                path = "images/kenneth_images/small/" + str(image_name)
                # NOTE(review): the file object opened here is never closed
                # explicitly.
                params = {'file': file(path, 'rb'), 'product_id': id[0],
                          'index': str(i + 1), 'tags': tags}
                          #token not working
                opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
                code = opener.open(url, params).getcode()

                # any status other than 202 is only reported on the console
                if (code != 202):
                    print ("Achtung")
                # NOTE(review): expects a module-level images_number counter
                # to exist already - confirm it is defined in this module.
                global images_number
                images_number += 1
                print images_number

                print "Image uploaded to product " + id[0]
        else:
            #print "Image upload overriden.."
            pass

    # function for getting full-size image paths on our server
    def get_image_server_path(self, urls, id):
        """Return the "full" image path on our server for every url.

        Each path is ``self.images_store + id + "/full/"`` followed by the
        last "/"-separated part of the url.
        """
        return [self.images_store + id + "/full/" + url.split("/")[-1]
                for url in urls]

    # function for getting image paths on our server
    def get_image_server_path_thumb(self, urls, id):
        """Return the "small" image path on our server for every url.

        Each path is ``self.images_store + id + "/small/"`` followed by the
        last "/"-separated part of the url.
        """
        return [self.images_store + id + "/small/" + url.split("/")[-1]
                for url in urls]

    def clean_price(self, price):
        """Return ``price`` reduced to digits, dots and commas, in a list."""
        digits_only = re.sub('[^0-9.,]', '', price)
        return [digits_only]

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Builds the run summary, updates the database when the run was started
        from the interface, writes the XML file, exports it when requested,
        mails the summary and, for interface runs, stores it under
        ``logs/<spider name>/``.

        ``spider`` is the argument delivered with the signal; it is not used.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped {0} product out of {1}\n\n".format(self.counter, self.total)
        # Fallback filename for writing xml: only used when the catalog name
        # could not be read from the database.  Without it a database failure
        # left ``filename`` unbound and write_xml() below raised
        # UnboundLocalError, so neither the xml nor the mail was produced.
        filename = self.name
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            # NOTE(review): the export is not guarded (its try/except was
            # commented out), so a failing export propagates from here.
            exp.xml_to_db(self.name, filename, "29eac9ea-8c57-4d22-baf4-3f1471dc3ab6")
            msg += "\n\nExport to database successful"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "KennethCole: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "KennethCole: {0}".format(filename), self.d['email'])
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the excel file.

        The file is the one named by ``self.d['file']``.  Read failures are
        reported through ``self.exc.code_handler`` with code 103; afterwards
        the lists are de-duplicated, split into products with and without
        urls, and both parts are passed through ``_add_none_status``.
        """
        file_note = "\nError occurred for given file: {0}"
        workbook = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = workbook.read_excel_collumn_for_urls(2, 2)
            self.products['product_ids'] = workbook.read_excel_collumn_for_ids(0, 2)
            self.products['names'] = workbook.read_excel_collumn(1, 2)
        except IOError as err:
            detail = "I/O error {0}: {1}".format(err.errno, err.strerror)
            self.exc.code_handler(103, msg=detail + file_note.format(self.d['file']))
        except StandardError:
            detail = "Error reading excel file"
            self.exc.code_handler(103, msg=detail + file_note.format(self.d['file']))
        self.products = workbook.delete_duplicates_dict(self.products)
        self.products, self.no_urls = workbook.separate_no_urls(self.products)
        self.products = workbook._add_none_status(self.products)
        self.no_urls = workbook._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom product properties on ``xml``.

        Every row below is passed to ``xml.add_property`` as three positional
        arguments, in the listed order.
        """
        properties = (
            ("add_to_cart_id", "Add To Cart Id", "text"),
            ("product_page", "Product page", "text"),
            ("color_image_urls", "Color Image URLs", "text_list"),
            ("color_option_id", "Color Option ID", "text"),
            ("recommended_product", "Recommended Product", "text_list"),
            ("size_option_id", "Size Option ID", "text"),
            ("in_stock", "In Stock", "text"),
            ("old_price", "Old Price", "text"),
            ("new_price", "New Price", "text"),
        )
        for entry in properties:
            xml.add_property(*entry)

Exemple #14

0

Afficher le fichier

class SportmanSpider(CrawlSpider):
    name = "sportman"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Set up the spider: arguments, XML writer and the product lists.

        The product lists come from the database when ``self.d['database']``
        is set, otherwise from the excel file; their urls become
        ``start_urls``.
        """
        super(SportmanSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")

        if not self.d['database']:
            self.get_lists_from_excel()
        else:
            self.database = Database()
            self.database.connect()
            selected = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.products, self.no_urls = selected
            self.database.disconnect()
        self.add_properties(self.xml)
        self.start_urls = self.products['urls']
        self.images_store = "".join(("/", settings['IMAGES_STORE']))
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page and write it to the XML output.

        A redirected request is written as NOT_AVAILABLE and reported with
        code 102.  Otherwise the basic info, the form state (the viewstate is
        stored in six pieces), the size options and the image paths are
        written.  Any failure is reported with code 100 and the product
        status is set to "error".

        Returns the item on success and ``None`` after a failure.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        meta = response.request.meta
        redirected = 'redirect_urls' in meta
        # the product list holds the url that was requested originally
        cur_url = meta['redirect_urls'][0] if redirected else response.url
        index = self.products['urls'].index(cur_url)
        try:
            if redirected:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                (item["name"], item["short_desc"], item["description"],
                 item["old_price"], item["custom_price"], item["product_id"],
                 item["sku"]) = self.get_basic_info(hxs)
                item['in_stock'] = ['IN_STOCK']
                # only the first two of the eight returned values are used
                (viewstate, eventval, _prevpage, _hidden, _view_page,
                 _even_page, _pre_page, _hidd_page) = self.get_vars(response, hxs)

                # the viewstate is stored in pieces of 2000 characters; the
                # sixth piece takes everything from position 10000 on
                pieces = (
                    ("viewstate1", 0, 2000),
                    ("viewstate2", 2000, 4000),
                    ("viewstate3", 4000, 6000),
                    ("viewstate4", 6000, 8000),
                    ("viewstate5", 8000, 10000),
                    ("viewstate6", 10000, None),
                )
                for key, start, stop in pieces:
                    item[key] = [basic.cdata(viewstate[start:stop])]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)

                images_url = self.get_images(hxs)
                item["normal_image_url"] = self.get_server_path(images_url)

                self.xml.create_xml(item)
                # once written, the item is emptied and only carries the
                # image urls
                item.clear()
                item['image_urls'] = self.get_images(hxs)
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def get_basic_info(self, hxs):
        """Read the basic product data from the page.

        Returns ``(name, short_desc, description, old_price, price, id,
        sku)``.  Prices are the text after the first ':' with 'Kr' and spaces
        removed; the current price falls back to the "normalprice" span when
        there is no "nowprice" span.  ``sku`` is the list returned by
        ``basic.get_middle_text`` for the article number and ``id`` holds its
        first element.
        """
        def kroner(fragments):
            # text after the first ':' without the 'Kr' marker and spaces
            amount = " ".join(fragments).split(':')[1]
            return [amount.replace('Kr', '').replace(" ", "")]

        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
        short_desc = hxs.select('//div[@class="description2"]/text()').extract()

        description_nodes = hxs.select(
            '//div[@id="fragment-1"]/div[@class="description"]').extract()
        stripped = sportman.delete_tags(re, description_nodes[0])
        description = [basic.cdata(stripped)]

        # the old price stays an empty list when the page shows none
        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price != []:
            old_price = kroner(old_price)

        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if price == []:
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
        price = kroner(price)

        article = " ".join(hxs.select('//div[@class="articlenumber"]').extract())
        article = article.replace(u"\xa0", "")
        sku = basic.get_middle_text(article, 'Art.nr.', '</div>')
        id = [sku[0]]

        return name, short_desc, description, old_price, price, id, sku

    def get_vars(self, response, hxs):
        """Read the ASP.NET form state of the product page.

        The values are read twice: from the page already held in ``hxs`` and
        from a second download of ``response.url`` made with the fixed request
        headers below.  For the second download a script url found between
        the __VIEWSTATE and __PREVIOUSPAGE fields is fetched as well, to read
        the ScriptManager hidden-field value.

        Returns eight strings: viewstate, event validation, previous page and
        hidden field for the first page (the last two are always empty),
        followed by the same four values for the second download.
        """
        # fixed headers (including a recorded session cookie) for the second
        # download
        headers1 = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1',
            'Host':
            'www.sportmann.no',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language':
            'en-us,en;q=0.5',
            'Accept-Charset':
            'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection':
            'keep-alive',
            'Referer':
            '/product.aspx?productid=613232',
            'Cookie':
            'ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'
        }

        page = hxs.select('//html').extract()
        page = " ".join(page)

        # form state of the page that scrapy already downloaded
        viewst = basic.get_middle_text(page, 'id="__VIEWSTATE" value="', '"')
        eventval = basic.get_middle_text(page,
                                         'id="__EVENTVALIDATION" value="', '"')
        # not read from the first page: always returned as empty strings
        prevpage = [""]
        hidden_field = [""]

        r = requests.get(response.url, headers=headers1)

        page_one = r.content

        # form state of the second download
        viewst_page = basic.get_middle_text(page_one,
                                            'id="__VIEWSTATE" value="', '"')
        eventval_page = basic.get_middle_text(
            page_one, 'id="__EVENTVALIDATION" value="', '"')
        prevpage_page = basic.get_middle_text(page_one,
                                              'id="__PREVIOUSPAGE" value="',
                                              '"')
        # take the last '<script sr' part between the __VIEWSTATE and the
        # __PREVIOUSPAGE fields; its 'c="..."' value (the rest of src=) is the
        # script url
        hidden_temp = page_one.split('id="__VIEWSTATE"')
        hidden_temp = hidden_temp[1].split('id="__PREVIOUSPAGE"')
        hidden_temp = hidden_temp[0].split('<script sr')

        val_x = len(hidden_temp) - 1

        hidden_temp = basic.get_middle_text(hidden_temp[val_x], 'c="', '"')
        hidden_temp_val = hidden_temp[0]
        # turns the html-escaped '&amp;' of the url back into '&'
        hidden_temp_val = hidden_temp_val.replace('amp;', '')
        hidden_url = "http://www.sportmann.no" + hidden_temp_val

        # the hidden field value is assigned inside the downloaded script
        request_hidden = urllib2.Request(hidden_url)
        response_hidden = urllib2.urlopen(request_hidden)
        hidden_field_page = basic.get_middle_text(
            response_hidden.read(),
            "ctl00_ScriptManager1_HiddenField').value += '", "';")

        return viewst[0], eventval[0], prevpage[0], hidden_field[
            0], viewst_page[0], eventval_page[0], prevpage_page[
                0], hidden_field_page[0]

    def get_variants(self, hxs, response):
        """Collect colour/size variant data for the product page.

        The page markup is inspected as plain text: the ``<div class="color">``
        and ``<div class="size">`` sections are checked for a ``<select name``
        dropdown. When a dropdown is present its options are parsed; otherwise
        the single value is read from the div text and a hidden input.

        :param hxs: selector for the product page (``select``/``extract``
            are called on it).
        :param response: response for the product page; forwarded to
            ``self.get_vars`` and ``self.get_data``.
        :return: list of strings, each produced by ``basic.cdata`` from a
            JSON dump of a dict with the keys ``color``, ``color_value``,
            ``size`` and ``size_value``.
        """
        page = hxs.select('//html').extract()
        page = " ".join(page)
        dict_one = {}
        test_one = []

        # Isolate the colour div and split on '<select name': a single piece
        # means the div holds no dropdown.
        temp = page.split('<div class="color">')
        temp = temp[1].split('</div>')
        temp = temp[0].split('<select name')

        # Only the four "*_page" values are used below (passed to get_data);
        # the first four are unpacked but unused in this method.
        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs)

        if (len(temp) == 1):
            # No colour dropdown: take the div text and the hidden input value.
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value'
            ).extract()
            color[0] = color[0].replace("  ", "")
            color = basic.clean_string(color[0])
            # NOTE(review): from here on `value` is a single extracted item
            # (and `color` whatever clean_string returns), not a list, yet the
            # `size[0] == ""` branch below indexes both with `[i]` -- confirm
            # that combination is intended/reachable.
            value = value[0]

        #            color = basic.clean_string(color[0])
        #            color = color.replace("  ","")
        #
        #            dict['color'] = color
        #            dict['color_value'] = value[0]

        else:
            # Colour dropdown: parse the options that follow the
            # 'farge</option>' entry. get_middle_text appears to return a
            # list of matches (it is indexed and iterated here).
            test_color = basic.get_middle_text(temp[1], 'farge</option>',
                                               '</select>')
            color = basic.get_middle_text(test_color[0], '">', '</option>')
            value = basic.get_middle_text(test_color[0], 'value="', '">')

            for i in range(0, len(color)):
                color[i] = color[i].replace("  ", "")
            #
            #                dict['color'] = color
            #                dict['color_value'] = value

        # Same dropdown-or-not detection for the size div.
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split('</div>')
        size_temp = size_temp[0].split('<select name')

        if (len(size_temp) == 1):
            # No size dropdown on the initial page.
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace("   ", "")]

            size_val = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value'
            ).extract()

            if size[0] == "":
                # The size text is empty: request the page once per colour
                # value via get_data and parse the sizes out of each reply.
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page,
                                              pre_page, even_page, value[i])

                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split('</select>')

                    if len(a_page) == 1:
                        # Reply has no '</select>' after the size div: read
                        # the size text and the first value="..." attribute.

                        size = basic.get_middle_text(a_page[0], 'e">',
                                                     '<input type="hidden"')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]

                    else:
                        # Reply contains a size dropdown: parse the options
                        # that follow the 'se</option>' entry.
                        a_page = basic.get_middle_text(a_page[0],
                                                       'se</option>',
                                                       '</select>')
                        size = basic.get_middle_text(a_page[0], '">',
                                                     '</option>')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '">')

                    # dict_one is reused across iterations; it is serialised
                    # immediately below, so each appended entry is a snapshot.
                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val

                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace("   ", "")

                        # Set inside the loop, so "size" is only (re)assigned
                        # when `size` is non-empty; otherwise a value from a
                        # previous iteration (if any) stays in dict_one.
                        dict_one["size"] = size

                    test_one.append(basic.cdata(json.dumps(dict_one)))

            else:
                # A single size is present in the div text: emit one entry.
                dict_one["color"] = color

                dict_one["color_value"] = value
                dict_one['size'] = size
                dict_one['size_value'] = size_val
                # NOTE(review): this branch serialises with `simplejson` while
                # the other branches use `json` -- confirm both are imported.
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))

        else:
            # Size dropdown on the initial page: parse the options that
            # follow the 'se</option>' entry and emit a single entry.
            test_size = basic.get_middle_text(size_temp[1], 'se</option>',
                                              '</select>')
            size = basic.get_middle_text(test_size[0], '">', '</option>')
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')

            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace("   ", "")

            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one['size'] = size
            dict_one['size_value'] = size_val

            test_one.append(basic.cdata(json.dumps(dict_one)))

        return test_one

    def get_server_path(self, url):
        """Map image URLs to their paths under ``self.images_store``.

        Each entry of ``url`` is replaced in place by its
        ``basic.clean_string`` result; the returned path is
        ``<images_store>/full/<sha1 hex digest of the cleaned url>.jpg``.

        :param url: list of image URL strings (modified in place).
        :return: list of path strings, one per URL, in the same order.
        """
        server_paths = []
        for position, raw_url in enumerate(url):
            cleaned = basic.clean_string(raw_url)
            # Keep the caller's list in sync with the cleaned value.
            url[position] = cleaned

            prefix = self.images_store + "/full/"
            digest = hashlib.sha1(cleaned).hexdigest()
            server_paths.append(prefix + digest + ".jpg")

        return server_paths

    def get_images(self, hxs):
        """Return absolute URLs of the images in the page's gallery section.

        The gallery is the markup between ``class="gallery_demo_unstyled"``
        and ``<div class="right_container">``; every ``src="..."`` value
        found there is prefixed with the site host.

        :param hxs: selector for the product page.
        :return: list of image URL strings.
        """
        markup = " ".join(hxs.select('//html').extract())

        # Cut the page down to the gallery section only.
        after_gallery = markup.split('class="gallery_demo_unstyled"')[1]
        gallery = after_gallery.split('<div class="right_container">')[0]

        sources = basic.get_middle_text(gallery, 'src="', '"')

        return ["http://www.sportmann.no" + src for src in sources]

    def get_data(self, response, hidden, viewstate, previouspage,
                 eventvalidation, colorvalue):
        """POST an ASP.NET async postback selecting a colour variant.

        Builds the form body by hand (the field order and pre-encoded
        AjaxControlToolkit payload are kept verbatim) and sends it to
        ``response.url``.

        :param response: page response; only ``response.url`` is used.
        :param hidden: value for ``ctl00_ScriptManager1_HiddenField``.
        :param viewstate: value for ``__VIEWSTATE``.
        :param previouspage: value for ``__PREVIOUSPAGE``.
        :param eventvalidation: value for ``__EVENTVALIDATION``.
        :param colorvalue: value posted for the ``ddlVariant`` dropdown;
            it is concatenated as-is (not URL-encoded).
        :return: the raw body of the server's reply.
        :raises urllib2.URLError: propagated from ``urllib2.urlopen``.
        """
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Host': 'www.sportmann.no',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection': 'keep-alive',
            'Referer': 'http://www.sportmann.no/product.aspx?productid=613232',
            'Cookie': ''
        }

        # Each of these becomes a ready-to-concatenate "name=value" pair.
        eventvalidation = urllib.urlencode(
            {"__EVENTVALIDATION": eventvalidation})
        viewstate = urllib.urlencode({"__VIEWSTATE": viewstate})
        previouspage = urllib.urlencode({"__PREVIOUSPAGE": previouspage})
        hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden})

        # The expression is parenthesised so it can span several lines; a
        # bare trailing '+' at the end of a line is a syntax error.
        data = (
            "ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel%7Cctl00%24ContentPlaceHolder1%24ddlVariant&"
            + hidden +
            "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant&__EVENTARGUMENT=&__LASTFOCUS=&"
            + viewstate + "&" + previouspage + "&" + eventvalidation +
            "&ctl00%24ProductSearch%24txtProdSearch=&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState=&ctl00%24ContentPlaceHolder1%24ddlVariant="
            + colorvalue +
            "&ctl00%24ContentPlaceHolder1%24Variant1Hidden=&ctl00%24ContentPlaceHolder1%24Variant2Hidden=&ctl00%24ContentPlaceHolder1%24tbAmount=1&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState=&__ASYNCPOST=true&"
        )

        # Supplying a body makes urllib2 issue a POST.
        req = urllib2.Request(response.url, data, headers)

        reply = urllib2.urlopen(req)
        try:
            resp_page = reply.read()
        finally:
            # Release the connection even if reading fails.
            reply.close()

        return resp_page

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.

        Steps, in order: build the run summary, update the database (when
        ``self.d['database']`` is set) and resolve the output filename, write
        the XML, optionally export it, mail the summary, and finally write
        the summary to ``logs/<spider name>/<filename>`` for database runs.

        :param spider: the spider instance passed by the signal (unused).
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml
        if self.d['database']:
            # Fallback name: if the database step fails before get_name
            # returns, the XML, mail and log steps below still need a
            # filename instead of hitting an unbound local.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                # Best effort: report the failure in the message and go on.
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Sportmann: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Sportmann: {0}".format(filename),
                               self.d['email'])
        except Exception:
            # Mailing is best effort; the note only reaches the log file.
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the Excel file.

        The file is located with ``basic.get_excel_path`` using the spider
        name and ``self.d['file']``. Read errors are reported through
        ``self.exc.code_handler`` with code 103; the post-processing steps
        (duplicate removal, splitting off entries without urls, adding the
        "none" status) run afterwards on whatever was read.
        """
        excel_path = basic.get_excel_path(self.name, self.d['file'])
        xls = DictExcel(excel_path)
        self.products = {}
        file_note = "\nError occurred for given file: {0}"
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            report = "I/O error {0}: {1}".format(e.errno, e.strerror)
            report += file_note.format(self.d['file'])
            self.exc.code_handler(103, msg=report)
        except StandardError:
            report = "Error reading excel file"
            report += file_note.format(self.d['file'])
            self.exc.code_handler(103, msg=report)

        # Post-processing: each step rebinds the attributes in turn.
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        xml.add_property("short_desc", "Short Description", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("custom_price", "New Price", "text")
        xml.add_property("color_value", "Color Value", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("size_val", "Size Value", "text_list")
        xml.add_property("sku", "Sku", "text")
        xml.add_property("size_options", "Size_options", "text_list")
        xml.add_property("viewstate1", "Viewstate1", "text_list")
        xml.add_property("viewstate2", "Viewstate2", "text_list")
        xml.add_property("viewstate3", "Viewstate3", "text_list")
        xml.add_property("viewstate4", "Viewstate4", "text_list")
        xml.add_property("viewstate5", "Viewstate5", "text_list")
        xml.add_property("viewstate6", "Viewstate6", "text_list")
        xml.add_property("eventval", "Eventval", "text_list")
        xml.add_property("hidden", "Hidden Field", "text_list")
        xml.add_property("prevpage", "Previous Page", "text_list")
        xml.add_property("recommended_product", "Recommended Product",
                         "text_list")