Example #1
class GuitarCenterSpider(CrawlSpider):
    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d["database"]:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d["catalog_id"], self.d["product_id"])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products["urls"]
        self.total = len(self.products["urls"])

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()
        if "redirect_urls" in response.request.meta:
            cur_url = response.request.meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            item["product_id"] = [self.products["product_ids"][index]]
            item["name"], item["brand"] = self.get_basic_info(hxs)
            item["heading"], item["details"], item["specs"], item["call_to_action"] = self.get_description(hxs)
            item["brand_image"], item["brand_image_promo"], brand_images = self.get_description_images(hxs)
            item["old_price"], item["discount"], item["price"] = self.get_prices(hxs)
            item["image_json"], img = self.get_images(hxs)
            item["serial"] = self.get_serials(hxs)
            item["warranty"] = self.gold_coverage(hxs)
            item["in_stock"] = self.get_available(hxs)
            item["product_ref"], item["add_to_cart_id"] = self.get_add_to_cart(hxs)
            if not item["add_to_cart_id"]:
                item["in_stock"] = ["NOT_AVAILABLE"]
            item["shipping"] = self.get_shipping(hxs)
            item["colors"] = self.get_colors(hxs)
            self.products["status"][index] = "ran"
        except StandardError:
            self.products["status"][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            item["image_urls"] = img + brand_images
        return item

    def handle_not_provided(self):
        item = GuitarCenterItem()
        for n in self.no_urls["product_ids"]:
            item["product_id"] = [n]
            index = self.no_urls["product_ids"].index(n)
            item["name"] = [self.no_urls["names"][index]]
            item["in_stock"] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        name = hxs.select('//h1[@class="fn"]/text()').extract()
        name = [basic.clean_string("".join(name))]
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        name = [name[0].replace(u"\xa0", "")]
        return name, brand

    def get_description_images(self, hxs):
        brand_image = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        brand_image_promo = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        images = brand_image + brand_image_promo
        if brand_image:
            brand_image = [self.get_server_path(brand_image[0])]
        if brand_image_promo:
            brand_image_promo = [self.get_server_path(brand_image_promo[0])]
        return brand_image, brand_image_promo, images

    def get_description(self, hxs):
        heading = hxs.select('//div[@id="description"]/p').extract()
        details = hxs.select('//p[@class="description"]').extract()
        specs = hxs.select('//div[@class="specs"]/ul').extract()
        last = hxs.select('//div[@class="callToAction"]/p/text()').extract()
        return basic.cdata_field(heading), basic.cdata_field(details), basic.cdata_field(specs), basic.cdata_field(last)

    # Returns old price, discount and price; any of the three comes back as an
    # empty field when the page offers no such value (the "new" price on the page is the discount)
    def get_prices(self, hxs):
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        price = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
        try:
            discount = [basic.clean_string(value[-1])]
        except IndexError:
            print "This product has no price."
        price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    # returns jsons with the image url and the serial number of the product each image refers to
    def get_images(self, hxs):
        images = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        images_list = []
        d = {}
        img = []
        for i in range(0, len(images)):
            d["image_url"] = self.get_server_path(images[i])
            img.append(images[i])
            if "site1sku" in tags[i]:
                d["product_serial"] = tags[i].replace("site1sku", "")
            else:
                d["product_serial"] = tags[i]
            images_list.append(basic.cdata(simplejson.dumps(d)))
        return images_list, img

    # Returns a field of jsons with all serial information; can be changed to
    # return dicts if subproducts ever need them
    def get_serials(self, hxs):
        serials = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        new = []
        for serial in serials:
            d = simplejson.loads(serial)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    def get_server_path(self, url):
        # currently keeps the absolute image path from their site; swap the
        # returns to use the local pipeline path instead
        return url
        # return IMAGES_STORE + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # function for getting gold coverage from the page, which is actually the additional warranty options
    def gold_coverage(self, hxs):
        ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        d = {}
        new = []
        for i in range(0, len(ids)):
            d["id"] = ids[i]
            d["name"] = labels[i]
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # function for getting availability
    def get_available(self, hxs):
        p = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if p:
            if p[0] == "in_stock":
                p = [p[0].upper()]
        else:
            # products with color options carry a stock status per color, so
            # mark the product itself IN_STOCK when the page has no product-level status
            p = ["IN_STOCK"]
        return p

    # function for getting add to cart id and product reference
    def get_add_to_cart(self, hxs):
        try:
            temp = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()[0]
        except IndexError:
            print "Product not available"
        else:
            return [temp.split("|")[0]], [temp.split("|")[1]]
        return [], []

    # function for getting shipping information
    def get_shipping(self, hxs):
        return hxs.select('//div[@id="targeter_pdpShipping"]/span/text()').extract()

    # function for getting colors; returns jsons with all the data about options
    def get_colors(self, hxs):
        colors = hxs.select('//var[@class="styleInfo"]/text()').extract()
        new = []
        for color in colors:
            d = simplejson.loads(color)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # cleaning price to leave only numbers
    def clean_price(self, price):
        new = []
        for i in price:
            new.append(re.sub("[^0-9.]", "", i))
        return new
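    # Example with a hypothetical raw value:
    #   clean_price(["$1,299.99 USD"]) -> ["1299.99"]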

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d["database"]:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail

        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d["email"])
        except:
            msg += "\nSending mail failed."
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def add_properties(self, xml):
        xml.add_property("old_price", "Old Price", "decimal")
        xml.add_property("image_json", "Image Json", "text_list")
        xml.add_property("discount", "Discount", "decimal")
        xml.add_property("product_ref", "Product Ref.", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("serial", "Serial", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("shipping", "Shipping", "text")
        xml.add_property("warranty", "Warranty", "text_list")
        xml.add_property("heading", "Heading", "text")
        xml.add_property("details", "Details", "text")
        xml.add_property("specs", "Specs", "text")
        xml.add_property("call_to_action", "Call To Action", "text")
        xml.add_property("brand_image", "Brand Image", "text")
        xml.add_property("brand_image_promo", "Brand Image Promo", "text")

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
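
The commented-out branch of get_server_path above mirrors how Scrapy's images pipeline names downloaded files: the SHA1 hex digest of the source URL under <IMAGES_STORE>/full/. A minimal standalone sketch of that convention (the IMAGES_STORE value here is a stand-in for the project setting):

import hashlib

IMAGES_STORE = "images"  # stand-in for the Scrapy IMAGES_STORE setting

def local_image_path(url):
    # Scrapy's images pipeline saves the download for `url` as
    # <IMAGES_STORE>/full/<sha1(url)>.jpg, so recomputing the digest
    # reproduces the on-disk path without re-running the pipeline.
    return IMAGES_STORE + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

# local_image_path("http://www.example.com/img/1.jpg")
# -> 'images/full/<40-char hex digest>.jpg'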
Example #2
class LydiasSpider(CrawlSpider):
    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(LydiasSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'],
                                                                        self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        # fix for bug with links they provide
        self.products['urls'] = basic.cut_string_field(self.products['urls'], "&cat=")
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        id = self.products['product_ids'][index]
        try:
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [id]
                item['name'], item['price'], item['old_price'], item['description'] = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = self.get_image_swatches(hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                self.create_subproducts(id, color_names, product_image, color_codes, hxs)
                item['image_urls'] = product_image + color_urls + chart + default_images
                self.products['status'][index] = "ran"
            else:
                self.exc.code_handler(102, response.url)
                item['product_id'] = [id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except StandardError:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    # function for checking if product has embroidery or not
    def get_embroidery(self, hxs):
        page = hxs.select('//html').extract()[0]
        if "document.getElementById('logocolor').disabled = true;" in page:
            return ["True"]
        else:
            return ["False"]

    # function for creating json with all information for colors
    def make_colors_json(self, color_urls, color_names, color_codes):
        d = {}
        jsons = []
        for i in range(0, len(color_urls)):
            d['color_url'] = self.get_server_path_single(color_urls[i])
            d['color_name'] = color_names[i]
            d['color_short'] = color_codes[i]
            jsons.append(basic.cdata(simplejson.dumps(d)))
        return jsons

    # function for getting image server path
    def get_server_path_single(self, url):
        # return url  # uncomment to keep the absolute path from the site
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # function for getting image path for field of images
    def get_server_path(self, urls):
        # return urls  # uncomment to keep the absolute paths from the site
        new = []
        for url in urls:
            new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    # function for getting basic information for product
    def get_basic_info(self, hxs):
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
        if not price:
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    # function for getting rating, both number and sentence (e.g. Rating 5 out of 6 votes)
    def get_rating(self, hxs):
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if temp:
            rating = basic.get_middle_text(temp[0].replace(" ", ""), "Rating:", "out")
            return rating, temp
        else:
            return [], temp

    # function for getting reviews; returns rating and a field of json reviews,
    # or empty fields if there are no reviews
    def get_reviews(self, hxs):
        reviews = hxs.select('//div[@class="prodReview"]')
        if reviews:
            title = reviews[0].select('p[@class="review_title"]/text()').extract()
            text = reviews[0].select('p[@class="review_text"]/text()').extract()
            author = reviews[0].select('p[@class="review_author"]/text()').extract()
            location = reviews[0].select('p[@class="review_location"]/text()').extract()
            jsons = self.make_reviews_json(title, text, author, location)
            return jsons
        else:
            return []

    # function for making json for reviews
    # currently not in use because there are no reviews in the DPW design
    def make_reviews_json(self, title, text, author, location):
        jsons = []
        for i in range(0, len(title)):
            json = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" :\
                    "%s" }' % (title[i], text[i], author[i], location[i])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    # function for getting size chart image
    def get_size_image(self, hxs):
        temp = hxs.select('//div[@class="TabbedPanelsContent cells"]/img/@src').extract()
        return temp

    # function for getting image swatches; returns fields (image urls, color names, product images, color codes)
    def get_image_swatches(self, hxs):
        colors = hxs.select('//div[@class="lolite"]')
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in colors:
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            # if the zoom image is needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            color_codes.append(color.select('a/@onclick').extract()[0].split(",")[1].replace("'", ""))
        return color_images, color_names, products_image, color_codes

    # function for getting additional images; returns a field of images, or an empty field if there are none
    def get_extra_images(self, hxs):
        additional_images = hxs.select('//div[@id="AddImg"]/script/text()').extract()
        if additional_images:
            temp = basic.get_middle_text(additional_images[0], '"', '"')
            thumb_images = temp[0].split(",")
            return thumb_images
        else:
            return []

    # function for getting product id from the page
    def get_product_id(self, hxs):
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        id = basic.get_middle_text(temp[0], 'productid","', '"')
        return id[0]

    # function for getting sizes from another url, returning a field of jsons for sizes
    # one id from the page is 115NB, if needed here to hardcode for testing
    # currently not in use
    def get_sizes(self, id, hxs):
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
        jsons = []
        print "reading page..."
        page = urllib2.urlopen(url).read()
        print "page read"
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
                    "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    # handles creating subproducts; can be reworked to the usual approach of one
    # product for every combination of size and color if needed
    def create_subproducts(self, id, color_names, product_image, color_codes, hxs):
        item = LydiasItem()
        # if no colors for specific product do this part and call to creating size children with empty string instead
        # of actual color name
        if len(color_names) == 0:
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)

        # for handling cases when there are color options for specific product, create child for every color, and call
        # for creating size children for every provided color
        else:
            for i in range(0, len(color_names)):
                print "name :" + color_names[i] + "  code:" + color_codes[i]
                item['master_product_id'] = [id]
                item['product_id'] = [id + "_" + str(i)]
                item['color'] = [color_names[i]]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path([product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + str(i), color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    # function for creating child products for sizes
    # a little messy with all the commented lines, but those lines can be used if
    # needed to go back to the old way with child products instead of json
    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        print color_code
        jsons = []
        # if block for cases when color is provided
        if color_code != "":
            showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
            url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
                "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
            page = urllib2.urlopen(url).read()
            page = page.replace("'", "")
            page = page.replace("[", ",")
            page = page.replace(",,", "")
            temp = page.split("]")
            for i in range(0, len(temp) - 2):
                tmp = temp[i].split(",")
                item = {}
#                item['master_product_id'] = [id]
                item['size_short'] = tmp[0]
                item['price_url'] = self.get_size_price(str(main_id), str(color_code), tmp[0])
                item['size'] = tmp[1]
#                item['product_id'] = [id + "_" + str(i)]
#                item['in_stock'] = ["IN_STOCK"]
#                xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons

        # when the color is not provided, a different block is needed because the page handles it differently
        else:
            temp = hxs.select('//div[@class="not_size"]/text()').extract()
            for i in range(0, len(temp)):
                item = {}
#                item['master_product_id'] = [id]
#                item['product_id'] = [id + "_" + str(i)]
                item['size_short'] = temp[i]
                item['price_url'] = self.get_size_price(str(main_id), "", temp[i])
#                item['in_stock'] = ["IN_STOCK"]
#                xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons

#        return 0

    # returns the url where the price for every size/color combination lives; that
    # url could be parsed to get the actual price, but it would drastically increase scraping time
    def get_size_price(self, id, color, size):
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        url = url.replace(" ", "%20")
        return url

    # prepends the site root to turn relative paths from the page into absolute urls
    def absolute_path(self, urls):
        new = []
        for i in urls:
            new.append("http://www.lydiasuniforms.com" + i)
        return new

    # function used for getting embroidery information from the client's page; it was
    # only run once because embroidery is the same for all products
    def get_emb(self, hxs):
        emb = hxs.select('//div[@id="emb"]').extract()
        lettering_colors = hxs.select('//select[@id="threadcolor"]/option/@value').extract()
        urls = []
        d = {}
        colors = []
        for i in range(1, len(lettering_colors)):
            d['type'] = "lettering colors"
            d['name'] = lettering_colors[i]
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
            d['url'] = self.get_server_path_single(url)

            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(d)))
        lettering = hxs.select('//select[@id="lettering"]/option/@value').extract()
        l = {}
        letterings = []
        for i in range(1, len(lettering)):
            l['type'] = "lettering"
            l['name'] = lettering[i]
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering[i].lower().replace(' ', '_') + ".gif"
            l['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(l)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        logos = {}
        log = []
        for i in range(1, len(logo)):
            logos['type'] = "logo"
            logos['name'] = logo[i]
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo[i].lower() + ".jpg"
            logos['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(logos)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        self.xml.create_xml(item)
        self.xml.write_xml("emb")

        return urls

    def handle_not_provided(self):
        item = LydiasItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, filename, "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)
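
A pattern these spiders share is serializing option data as JSON wrapped in CDATA (basic.cdata(simplejson.dumps(d))) so the payload survives intact inside the product XML. A minimal sketch of the idea, with a hypothetical cdata() standing in for basic.cdata:

import simplejson

def cdata(text):
    # hypothetical stand-in for basic.cdata: wraps the payload so it is
    # not XML-escaped when written into the feed
    return "<![CDATA[" + text + "]]>"

d = {"color_name": "Navy", "color_short": "NV"}  # assumed sample values
print cdata(simplejson.dumps(d))
# <![CDATA[{"color_name": "Navy", "color_short": "NV"}]]>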
Example #3
class SportmanSpider(CrawlSpider):
    name = "sportman"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(SportmanSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")

        if self.d["database"]:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d["catalog_id"], self.d["product_id"])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.start_urls = self.products["urls"]
        self.images_store = "/" + settings["IMAGES_STORE"]
        self.total = len(self.start_urls)

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        if "redirect_urls" in response.request.meta:
            cur_url = response.request.meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            if "redirect_urls" in response.request.meta:
                item["product_id"] = [self.products["product_ids"][index]]
                item["name"] = [self.products["names"][index]]
                item["in_stock"] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item["name"], item["short_desc"], item["description"], item["old_price"], item["custom_price"], item[
                    "product_id"
                ], item["sku"] = self.get_basic_info(hxs)
                item["in_stock"] = ["IN_STOCK"]
                viewstate, eventval, prevpage, hidden, view_page, even_page, pre_page, hidd_page = self.get_vars(
                    response, hxs
                )

                viewstate1 = viewstate[:2000]
                viewstate2 = viewstate[2000:4000]
                viewstate3 = viewstate[4000:6000]
                viewstate4 = viewstate[6000:8000]
                viewstate5 = viewstate[8000:10000]
                viewstate6 = viewstate[10000:]

                item["viewstate1"] = [basic.cdata(viewstate1)]
                item["viewstate2"] = [basic.cdata(viewstate2)]
                item["viewstate3"] = [basic.cdata(viewstate3)]
                item["viewstate4"] = [basic.cdata(viewstate4)]
                item["viewstate5"] = [basic.cdata(viewstate5)]
                item["viewstate6"] = [basic.cdata(viewstate6)]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)

                images_url = self.get_images(hxs)

                item["normal_image_url"] = self.get_server_path(images_url)

                self.xml.create_xml(item)
                item.clear()
                item["image_urls"] = self.get_images(hxs)
                self.products["status"][index] = "ran"
        except StandardError:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def get_basic_info(self, hxs):
        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()

        short_desc = hxs.select('//div[@class="description2"]/text()').extract()

        description = hxs.select('//div[@id="fragment-1"]/div[@class="description"]').extract()
        description = sportman.delete_tags(re, description[0])
        description = [basic.cdata(description)]

        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price:
            old_price = " ".join(old_price)
            old_price = old_price.split(":")[1].replace("Kr", "")
            old_price = [old_price.replace(" ", "")]

        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if not price:
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
        price = " ".join(price)
        price = price.split(":")[1].replace("Kr", "")
        price = [price.replace(" ", "")]

        id = hxs.select('//div[@class="articlenumber"]').extract()
        id = " ".join(id)
        id = id.replace(u"\xa0", "")
        id = basic.get_middle_text(id, "Art.nr.", "</div>")
        sku = id
        id = [id[0]]

        return name, short_desc, description, old_price, price, id, sku

    def get_vars(self, response, hxs):
        headers1 = {
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1",
            "Host": "www.sportmann.no",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us,en;q=0.5",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Connection": "keep-alive",
            "Referer": "/product.aspx?productid=613232",
            "Cookie": "ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
        }

        page = hxs.select("//html").extract()
        page = " ".join(page)

        viewst = basic.get_middle_text(page, 'id="__VIEWSTATE" value="', '"')
        eventval = basic.get_middle_text(page, 'id="__EVENTVALIDATION" value="', '"')
        prevpage = [""]
        hidden_field = [""]

        r = requests.get(response.url, headers=headers1)

        page_one = r.content

        viewst_page = basic.get_middle_text(page_one, 'id="__VIEWSTATE" value="', '"')
        eventval_page = basic.get_middle_text(page_one, 'id="__EVENTVALIDATION" value="', '"')
        prevpage_page = basic.get_middle_text(page_one, 'id="__PREVIOUSPAGE" value="', '"')
        hidden_temp = page_one.split('id="__VIEWSTATE"')
        hidden_temp = hidden_temp[1].split('id="__PREVIOUSPAGE"')
        hidden_temp = hidden_temp[0].split("<script sr")

        val_x = len(hidden_temp) - 1

        hidden_temp = basic.get_middle_text(hidden_temp[val_x], 'c="', '"')
        hidden_temp_val = hidden_temp[0]
        hidden_temp_val = hidden_temp_val.replace("amp;", "")
        hidden_url = "http://www.sportmann.no" + hidden_temp_val

        request_hidden = urllib2.Request(hidden_url)
        response_hidden = urllib2.urlopen(request_hidden)
        hidden_field_page = basic.get_middle_text(
            response_hidden.read(), "ctl00_ScriptManager1_HiddenField').value += '", "';"
        )

        return (
            viewst[0],
            eventval[0],
            prevpage[0],
            hidden_field[0],
            viewst_page[0],
            eventval_page[0],
            prevpage_page[0],
            hidden_field_page[0],
        )

    def get_variants(self, hxs, response):
        page = hxs.select("//html").extract()
        page = " ".join(page)
        dict_one = {}
        test_one = []

        temp = page.split('<div class="color">')
        temp = temp[1].split("</div>")
        temp = temp[0].split("<select name")

        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs
        )

        if len(temp) == 1:
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value').extract()
            color[0] = color[0].replace("  ", "")
            color = basic.clean_string(color[0])
            value = value[0]

        #            color = basic.clean_string(color[0])
        #            color = color.replace("  ","")
        #
        #            dict['color'] = color
        #            dict['color_value'] = value[0]

        else:
            test_color = basic.get_middle_text(temp[1], "farge</option>", "</select>")
            color = basic.get_middle_text(test_color[0], '">', "</option>")
            value = basic.get_middle_text(test_color[0], 'value="', '">')

            for i in range(0, len(color)):
                color[i] = color[i].replace("  ", "")
            #
            #                dict['color'] = color
            #                dict['color_value'] = value

        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split("</div>")
        size_temp = size_temp[0].split("<select name")

        if len(size_temp) == 1:
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace("   ", "")]

            size_val = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value').extract()

            if size[0] == "":
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i])

                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split("</select>")

                    if len(a_page) == 1:

                        size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"')
                        size_val = basic.get_middle_text(a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]

                    else:
                        a_page = basic.get_middle_text(a_page[0], "se</option>", "</select>")
                        size = basic.get_middle_text(a_page[0], '">', "</option>")
                        size_val = basic.get_middle_text(a_page[0], 'value="', '">')

                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val

                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace("   ", "")

                        dict_one["size"] = size

                    test_one.append(basic.cdata(json.dumps(dict_one)))

            else:
                dict_one["color"] = color

                dict_one["color_value"] = value
                dict_one["size"] = size
                dict_one["size_value"] = size_val
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))

        else:
            test_size = basic.get_middle_text(size_temp[1], "se</option>", "</select>")
            size = basic.get_middle_text(test_size[0], '">', "</option>")
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')

            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace("   ", "")

            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one["size"] = size
            dict_one["size_value"] = size_val

            test_one.append(basic.cdata(json.dumps(dict_one)))

        return test_one

    def get_server_path(self, url):
        images_array = []
        for i in range(0, len(url)):
            url[i] = basic.clean_string(url[i])

            images_array.append(self.images_store + "/full/" + hashlib.sha1(url[i]).hexdigest() + ".jpg")

        return images_array

    def get_images(self, hxs):
        page = hxs.select("//html").extract()
        page = " ".join(page)

        images = []

        temp = page.split('class="gallery_demo_unstyled"')
        temp = temp[1].split('<div class="right_container">')
        temp = basic.get_middle_text(temp[0], 'src="', '"')

        for i in range(0, len(temp)):
            image_url = "http://www.sportmann.no" + temp[i]
            images.append(image_url)

        return images

    def get_data(self, response, hidden, viewstate, previouspage, eventvalidation, colorvalue):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Host": "www.sportmann.no",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us,en;q=0.5",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Connection": "keep-alive",
            "Referer": "http://www.sportmann.no/product.aspx?productid=613232",
            "Cookie": "",
        }

        eventvalidation = urllib.urlencode({"__EVENTVALIDATION": eventvalidation})
        viewstate = urllib.urlencode({"__VIEWSTATE": viewstate})
        previouspage = urllib.urlencode({"__PREVIOUSPAGE": previouspage})
        hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden})

        data = (
            "ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel%7Cctl00%24ContentPlaceHolder1%24ddlVariant&"
            + hidden
            + "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant&__EVENTARGUMENT=&__LASTFOCUS=&"
            + viewstate
            + "&"
            + previouspage
            + "&"
            + eventvalidation
            + "&ctl00%24ProductSearch%24txtProdSearch=&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState=&ctl00%24ContentPlaceHolder1%24ddlVariant="
            + colorvalue
            + "&ctl00%24ContentPlaceHolder1%24Variant1Hidden=&ctl00%24ContentPlaceHolder1%24Variant2Hidden=&ctl00%24ContentPlaceHolder1%24tbAmount=1&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState=&__ASYNCPOST=true&"
        )

        # r = requests.get(response.url, h)
        req = urllib2.Request(response.url, data, headers)

        resp_page = urllib2.urlopen(req).read()

        return resp_page

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d["database"]:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail

        mail = Mail()
        try:
            mail.send_mail(msg, "Sportmann: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "Sportmann: {0}".format(filename), self.d["email"])
        except:
            msg += "\nSending mail failed."
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        xml.add_property("short_desc", "Short Description", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("custom_price", "New Price", "text")
        xml.add_property("color_value", "Color Value", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("size_val", "Size Value", "text_list")
        xml.add_property("sku", "Sku", "text")
        xml.add_property("size_options", "Size_options", "text_list")
        xml.add_property("viewstate1", "Viewstate1", "text_list")
        xml.add_property("viewstate2", "Viewstate2", "text_list")
        xml.add_property("viewstate3", "Viewstate3", "text_list")
        xml.add_property("viewstate4", "Viewstate4", "text_list")
        xml.add_property("viewstate5", "Viewstate5", "text_list")
        xml.add_property("viewstate6", "Viewstate6", "text_list")
        xml.add_property("eventval", "Eventval", "text_list")
        xml.add_property("hidden", "Hidden Field", "text_list")
        xml.add_property("prevpage", "Previous Page", "text_list")
        xml.add_property("recommended_product", "Recommended Product", "text_list")
Example #4
class BurtonSpider(CrawlSpider):
    name = "burton"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        page = hxs.extract()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if 'redirect_urls' in response.request.meta:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(
                    hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(
                    page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(
                    page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item
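
    # Note: because of the try/except/else above, the item is returned only
    # when extraction succeeded; on an exception parse() implicitly returns
    # None, so a failed product yields no item to the pipelines.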

    def handle_not_provided(self):
        item = BurtonItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        name = hxs.select('//h1[@class="productHeading"]/text()').extract()
        product_id = hxs.select('//input[@name="productId"]/@value').extract()
        return product_id, name

    def get_server_path(self, url):
        path = self.images_store + "/full/" + hashlib.sha1(
            url).hexdigest() + ".jpg"
        return path

    def get_prices(self, hxs):
        price = hxs.select('//div[@class="op"]/text()').extract()
        price = [basic.get_price(price[0])]
        old_price = hxs.select('//span[@class="lp"]/text()').extract()
        if old_price:
            old_price = [basic.get_price(old_price[0])]
        return price, old_price

    def get_description(self, hxs):
        description = hxs.select(
            '//div[@id="FieldsetProductInfo"]/text()').extract()[3]
        features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if features:
            features = [features[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(features)

    def get_variants(self, page):
        """Gets jsons for colors with all available sizes.
        In json are also fetched all information for sizes that are on the site
        """
        script = basic.get_middle_text(page,
                                       'var skuSizeColorObj = new Array();',
                                       '</script>')[0]
        sizes = []
        image_urls = []
        color_names = []
        colors = script.split('skuSizeColorObj')
        for c in range(1, len(colors)):
            temp = basic.get_middle_text(colors[c], '= ', ';')
            # remember the swatch image url, then swap it for the local server path
            t = simplejson.loads(burton.replace_for_json(temp[0]))
            image_urls.append(t['swatchURL'])
            color_names.append(t['ColorDesc'])
            t['swatchURL'] = self.get_server_path(t['swatchURL'])
            sizes.append(basic.cdata(simplejson.dumps(t)))
        return sizes, image_urls, color_names
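
    # Hypothetical helper for illustration: basic.get_middle_text is used
    # throughout these listings but never defined in them. It presumably
    # returns every substring found between a start and an end marker,
    # roughly like this sketch:
    @staticmethod
    def _get_middle_text_sketch(text, start, end):
        return [chunk.split(end)[0] for chunk in text.split(start)[1:]]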

    def get_all_sizes(self, page):
        script = basic.get_middle_text(page, 'var distsizeobj=new Array();',
                                       'var indexcolor=0;')[0]
        all_sizes = basic.get_middle_text(script, ']="', '";')
        return [basic.cdata(simplejson.dumps(all_sizes))]

    def get_colors(self, page, color_names):
        """Gets color information with images from javascript on the page.
        Returns  json with color name and imagself.images_store = "/" + settings['IMAGES_STORE']e url for that color, and
        returnes filed of image urls that can be used for download later"""
        script = basic.get_middle_text(page, 'var imageMap_0 = new Array();',
                                       '</script>')[0]
        colors = basic.get_middle_text(script, '] = ', ';')
        image_urls = []
        colors_json = []
        for i in range(0, len(color_names)):
            color = burton.replace_color_json(colors[i])
            color = simplejson.loads(color)
            color['cname'] = color_names[i]
            color.pop('reg')
            image_urls.append(color['enh'])
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename),
                               self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
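
BurtonSpider, like most spiders in these listings, detects dead product pages
by checking response.request.meta for 'redirect_urls'. That key is filled in
by Scrapy's built-in RedirectMiddleware: when a request gets redirected, the
URLs it passed through are recorded there with the originally requested URL
first, which is what lets parse() map the response back to the right row of
the product list. A stand-alone illustration (the spider name and URL are
hypothetical):

    from scrapy.spider import BaseSpider

    class RedirectDemoSpider(BaseSpider):
        name = "redirect_demo"
        start_urls = ["http://www.example.com/product/1"]

        def parse(self, response):
            if 'redirect_urls' in response.request.meta:
                # the first entry is the url that was originally requested
                original = response.request.meta['redirect_urls'][0]
            else:
                original = response.url
            self.log("requested %s, landed on %s" % (original, response.url))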
Example No. 5
class GuitarCenterSpider(CrawlSpider):
    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'],
                                                                        self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.total = len(self.products['urls'])

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()
        from scrapy.conf import settings
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'], item['brand'] = self.get_basic_info(hxs)
            item['heading'], item['details'], item['specs'], item['call_to_action'] = self.get_description(hxs)
            item['brand_image'], item['brand_image_promo'], brand_images = self.get_description_images(hxs)
            item['old_price'], item['discount'], item['price'] = self.get_prices(hxs)
            item['image_json'], img = self.get_images(hxs)
            item['serial'] = self.get_serials(hxs)
            item['warranty'] = self.gold_coverage(hxs)
            item['in_stock'] = self.get_available(hxs)
            item['product_ref'], item['add_to_cart_id'] = self.get_add_to_cart(hxs)
            if not item['add_to_cart_id']:
                item['in_stock'] = ["NOT_AVAILABLE"]
            item['shipping'] = self.get_shipping(hxs)
            item['colors'] = self.get_colors(hxs)
            self.products['status'][index] = "ran"
        except StandardError:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            item['image_urls'] = img + brand_images
        return item

    def handle_not_provided(self):
        item = GuitarCenterItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        name = hxs.select('//h1[@class="fn"]/text()').extract()
        name = [basic.clean_string("".join(name))]
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        name = [name[0].replace(u"\xa0", "")]
        return name, brand

    def get_description_images(self, hxs):
        brand_image = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        brand_image_promo = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        images = brand_image + brand_image_promo
        if brand_image:
            brand_image = [self.get_server_path(brand_image[0])]
        if brand_image_promo:
            brand_image_promo = [self.get_server_path(brand_image_promo[0])]
        return brand_image, brand_image_promo, images

    def get_description(self, hxs):
        heading = hxs.select('//div[@id="description"]/p').extract()
        details = hxs.select('//p[@class="description"]').extract()
        specs = hxs.select('//div[@class="specs"]/ul').extract()
        last = hxs.select('//div[@class="callToAction"]/p/text()').extract()
        return basic.cdata_field(heading), basic.cdata_field(details), basic.cdata_field(specs), basic.cdata_field(last)

    # function for getting prices; returns tags and values, or an empty field
    # if there is no option for one of them; the new price is the discount
    def get_prices(self, hxs):
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        price = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
        try:
            discount = [basic.clean_string(value[len(value) - 1])]
        except IndexError:
            print "This product has no price."
        try:
            price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        except IndexError:
            print "This product has no price."
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    # returns json with the image url and the serial number of the product the image refers to
    def get_images(self, hxs):
        images = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        images_list = []
        d = {}
        img = []
        for i in range(0, len(images)):
            d['image_url'] = self.get_server_path(images[i])
            img.append(images[i])
            if "site1sku" in tags[i]:
                d['product_serial'] = tags[i].replace("site1sku", "")
            else:
                d['product_serial'] = tags[i]
            images_list.append(basic.cdata(simplejson.dumps(d)))
        return images_list, img

    # function for getting serials and all information about them; currently returns a field of
    # jsons with all the information, and can be modified to return dicts for subproducts one day
    def get_serials(self, hxs):
        serials = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        new = []
        for serial in serials:
            d = simplejson.loads(serial)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    def get_server_path(self, url):
        # currently keeps the absolute image path from their site; switch to
        # the hashed path below if images are downloaded locally
        return url
        # return IMAGES_STORE + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # function for getting gold coverage from the page, which is actually additional warranty options
    def gold_coverage(self, hxs):
        ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        d = {}
        new = []
        for i in range(0, len(ids)):
            d['id'] = ids[i]
            d['name'] = labels[i]
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # function for getting availability
    def get_available(self, hxs):
        p = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if p:
            if p[0] == "in_stock":
                p = [p[0].upper()]
        else:
            # for products that have color options with an in-stock status per
            # color, put IN_STOCK on the product as the page has no overall flag
            p = ["IN_STOCK"]
        return p

    # function for getting add to cart id and product reference
    def get_add_to_cart(self, hxs):
        try:
            temp = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()[0]
        except IndexError:
            print "Product not available"
        else:
            return [temp.split("|")[0]], [temp.split("|")[1]]
        return [], []

    # function for getting shipping information
    def get_shipping(self, hxs):
        return hxs.select('//div[@id="targeter_pdpShipping"]/span/text()').extract()

    # function for getting colors; returns jsons with all the data about options
    def get_colors(self, hxs):
        colors = hxs.select('//var[@class="styleInfo"]/text()').extract()
        new = []
        for color in colors:
            d = simplejson.loads(color)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # cleaning price to leave only numbers
    def clean_price(self, price):
        new = []
        for i in price:
            new.append(re.sub('[^0-9.]', '', i))
        return new
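
    # e.g. clean_price(["$1,299.00"]) -> ["1299.00"]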

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def add_properties(self, xml):
        xml.add_property("old_price", "Old Price", "decimal")
        xml.add_property("image_json", "Image Json", "text_list")
        xml.add_property("discount", "Discount", "decimal")
        xml.add_property("product_ref", "Product Ref.", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("serial", "Serial", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("shipping", "Shipping", "text")
        xml.add_property("warranty", "Warranty", "text_list")
        xml.add_property("heading", "Heading", "text")
        xml.add_property("details", "Details", "text")
        xml.add_property("specs", "Specs", "text")
        xml.add_property("call_to_action", "Call To Action", "text")
        xml.add_property("brand_image", "Brand Image", "text")
        xml.add_property("brand_image_promo", "Brand Image Promo", "text")

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
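
A recurring idiom in these spiders is basic.cdata(simplejson.dumps(d)):
option dictionaries are serialized to JSON and then wrapped in a CDATA
section so the braces and quotes survive inside the product XML without
escaping. The cdata helper is never shown, so this is an assumed sketch of
the idiom rather than the project's actual code:

    import simplejson

    def cdata(text):
        # mark text as raw character data for the XML parser
        return "<![CDATA[" + text + "]]>"

    d = {'id': '1234', 'name': 'example option'}
    print cdata(simplejson.dumps(d))
    # prints <![CDATA[{"id": "1234", "name": "example option"}]]>
    # (key order may vary)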
Example No. 6
class ChomeSpider(CrawlSpider):
    name = "chome"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com/"]
    counter = 0

    def __init__(self, *a, **kw):
        super(ChomeSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'],
                                                                        self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.no_urls['product_ids'])

    def parse(self, response):
        self.counter += 1
        hxs = HtmlXPathSelector(response)
        item = ChomeItem()
        print "IDs in excel feed: {0}".format(self.total)
        item['image_urls'] = self.parse_whole_xml()
        return item

    def parse_whole_xml(self):
        xml_dir = "xml/{0}".format(self.name)
        file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
        downloader = Downloader()
        if self.d['download']:
            downloader.get_file(xml_dir, file_url, "client_feed")
        else:
            if not os.path.exists('xml/{0}/client_feed.xml'.format(self.name)):
                basic.warning("Feed file doesn't exist, please de-select the no-download option")
                os._exit(2)
        self.number = 0
        xml_item = ChomeItem()
        urls_all = []
        for event, elem in iterparse('xml/{0}/client_feed.xml'.format(self.name)):
            if elem.tag == "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties":
                for r in elem:
                    p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"
                    if r.tag == p + "Id" and r.text in self.no_urls['product_ids']:
                        index = self.no_urls['product_ids'].index(r.text)
                        self.no_urls['status'][index] = 'ran'
                        self.number += 1
                        urls = []
                        flag = 0
                        for x in elem:
                            if x.tag == p + "Id":
                                xml_item['product_id'] = [x.text]
                            elif x.tag == p + "EngLongDesc" and x.text is not None:
                                xml_item['description_english'] = [self.escape(basic.cdata(x.text))]
                            elif x.tag == p + "RetailPrice":
                                xml_item['custom_price'] = [x.text[:-2]]
                            elif x.tag == p + "SpnLongDesc" and x.text is not None:
                                xml_item['description_spanish'] = [self.escape(basic.cdata(x.text))]
                            elif x.tag == p + "PartNumber":
                                xml_item['add_to_cart_id'] = [x.text]
                            elif x.tag == p + "MaxQty":
                                xml_item['max_qty'] = [x.text]
                            elif x.tag == p + "TimeType":
                                xml_item['time_type'] = [x.text]
                            elif x.tag == p + "SpnName" and x.text is not None:
                                xml_item['name_spanish'] = [x.text]
                            elif x.tag == p + "EngName":
                                xml_item['name_english'] = [x.text]
                            elif x.tag == p + "ImagePath_Large" and x.text is not None:
                                urls.append(self.get_absolute(x.text))
                                xml_item['normal_image_url'] = [self.get_server_path(self.get_absolute(x.text))]
                            elif x.tag == p + "IsActive":
                                if x.text == 0:
                                    xml_item['in_stock'] = ["NOT_IN_STOCK"]
                                else:
                                    xml_item['in_stock'] = ['IN_STOCK']
                            else:
                                for i in range(1, 4):
                                    tag = p + "Alternate%sImagePath_Large" % (str(i))
                                    if x.tag == tag and x.text is not None:
                                        urls.append(self.get_absolute(x.text))
                                        xml_item['normal_image_url'].append(self.get_server_path(self.get_absolute(x.text)))
                                        # change image paths for normal_image_url and return urls
                        self.xml.create_xml(xml_item)
                        urls_all += urls
        for i in range(0, len(self.no_urls['status'])):
            if self.no_urls['status'][i] != 'ran':
                self.no_urls['status'][i] = 'not_found'
        return urls_all

    def get_server_path(self, url):
        path = self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"
        return path

    def get_absolute(self, url):
        return "http://www.celebratinghome.com/" + url

    def escape(self, string):
        """Unescapes twice to undo doubly-encoded HTML entities in the feed."""
        temp = HTMLParser.HTMLParser().unescape(string)
        return HTMLParser.HTMLParser().unescape(temp)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}\n".format(datetime.now())
        if self.total - self.number:
            msg += "{0} id(s) from id list weren't found in feed".format(self.total - self.number)
            basic.warning(msg)
        else:
            msg += "All ids found in feed."
            basic.green(msg)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.no_urls)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, self.d['file'], "40b029c9-dff7-4bc1-b8bc-ef062960b24d")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "CelebratingHome: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "CelebratingHome: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        xml.add_property("description_english", "Description English", "text")
        xml.add_property("description_spanish", "Description Spanish", "text")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("max_qty", "Max Quantity", "text")
        xml.add_property("time_type", "Time Type", "text")
        xml.add_property("name_english", "Name English", "text")
        xml.add_property("name_spanish", "Name Spanish", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("custom_price", "Custom Price", "text")
Example No. 7
class PartyliteSpider(CrawlSpider):
    name = "partylite"
    allowed_domains = ["partylite.biz"]
    start_urls = ["http://www.zmags.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(PartyliteSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = PartyliteTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.images_store = "/" + settings['IMAGES_STORE']
        self.users = party.get_users(settings, self.d)
        self.exc = ZmagsException(50)
        self.production = self.d['env']
        self.upload = self.d['upload']
        self.english = self.d['lang']
        self.file_name = self.d['file']
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'],
                                                                        self.d['product_id'])
            self.database.disconnect()
            self.change_url_list()
        else:
            self.get_lists_from_excel()
        self.xml = CommonXml()
        party.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):

        for url in self.products['urls']:

            if self.d['lang'] == 'us':
                request = Request(url, callback=self.parse_can, dont_filter=True)
                yield request

            elif self.d['lang'] == 'english':
                c_url = url.replace(self.users['us'], self.users['canada_en'])
                request = Request(c_url, callback=self.parse_can, dont_filter=True)
                request.meta['language'] = "eng"
                yield request

            elif self.d['lang'] == 'french':
                c_url = url.replace(self.users['us'], self.users['canada_fr'])
                request = Request(c_url, callback=self.parse_can, dont_filter=True)
                request.meta['language'] = "fr"
                yield request

    def change_url_list(self):
        for i in range(0, len(self.products['urls'])):
            if not self.production:
                self.products['urls'][i] = self.products['urls'][i].replace('www', 'qa')
            self.products['urls'][i] = self.products['urls'][i].replace('XXXXX', self.users['us'])

    def get_in_stock(self, hxs):
        """Gets in stock information about product."""
        stock = hxs.select('//div[@id="availability_container"]').extract()
        if not stock:
            return ["IN_STOCK"]
        else:
            return ["NOT_IN_STOCK"]

    def get_basic_info(self, hxs):
        """Getting basic info about products (name, shown with)."""
        name = hxs.select('//div[@id="product_name"]/text()').extract()
        if name:
            name = basic.cdata_field(name)
        shown_with = hxs.select('//div[@id="shown_with_container"]').extract()
        if shown_with:
            shown_with = [basic.cdata(shown_with[0])]
        return name, shown_with

    def get_description(self, hxs):
        description = hxs.select('//div[@id="item_description"]').extract()
        description = [basic.cdata(basic.remove_tags(description[0]))]
        description = [description[0].replace(u"\u2044", "/")]
        return description

    def get_price(self, hxs):
        """Getting product prices.
        Gets regular and discount price if there is one."""
        price = hxs.select('//span[@id="divUnitPrice"]/text()').extract()
        if not price:
            price = hxs.select('//div[@id="product_price"]/span[1]/text()').extract()
        if not price:
            price = hxs.select('//div[@id="product_price"]/text()').extract()
        discount = hxs.select('//div[@id="product_price"]/span[@class="pc-salePrice"]/text()').extract()
        price = basic.clean_string(price[0])
        price = re.sub(" +", " ", price)
        price = price.replace("Price:", "")
        price = price.replace("Prix:", "")
        price = basic.cdata(price.strip())
        if discount:
            discount = basic.cdata_field(discount)
        return [price], discount

    def get_add_to_cart_id(self, page):
        """Gets add to cart id from the javascript on the page."""
        tmp = basic.get_middle_text(page, "if(isOrderStarted){", "}else")[0]
        tmp = basic.get_middle_text(tmp, "addItemToCart(", ",")
        return tmp

    def create_subproducts(self, page):
        """Gets information about colors from javascript.
        Returns field of dicts with information about colors.
        Those are really color variants for product."""
        try:
            tmp = page.split("var largeImages = new Array();")[1]
        except IndexError:
            print "This product has no images"
        else:
            tmp = tmp.split("colorDropdownArray")[0]
            images = basic.get_middle_text(tmp, "ProductGroupProduct(", ");")
            image_names = self.get_image_names(page)
            color_products = []
            for im in images:
                product = {}
                attributes = im.split("',")
                product['normal_image_url'] = "http://qa.partylite.biz/imaging/resize?fileName=/productcatalog/production"
                product['normal_image_url'] += self.custom_clean_string(attributes[26], True)
                product['description'] = basic.cdata(self.custom_clean_string(attributes[27]))
                product['color_id'] = self.custom_clean_string(attributes[7], True)
                product['swatch_color'] = basic.cdata(self.custom_clean_string(attributes[9]).replace(" ", ""))
                product['name'] = basic.cdata(image_names[product['color_id']])
                product['add_to_cart_id'] = self.custom_clean_string(attributes[0], True).replace(" ", "")
                product['price'] = self.custom_clean_string(attributes[10], True)
                color_products.append(product)
            return color_products
        return []

    def custom_clean_string(self, string, spaces=False):
        """Custom function for cleaning strings.
        Replaces new line, return and tab signs, also replaces multiple spaces with only one."""
        string = string.replace("\r", "")
        string = string.replace("\n", "")
        string = string.replace("\t", "")
        if not spaces:
            string = re.sub(' +', ' ', string)
        else:
            string = re.sub(' ', '', string)
        string = string.replace("'", "")
        return string
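
    # e.g. custom_clean_string("it's  new\r\n") -> "its new"; with
    # spaces=True every space is stripped instead: "itsnew"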

    def get_image_names(self, page):
        """Gets color names for color swatches."""
        temp = page.split("new DropDownInfo")
        names = {}
        for i in range(1, len(temp)):
            names[basic.get_middle_text(temp[i], "('", "'")[0]] = basic.get_middle_text(temp[i], "'", "')")[2]
        return names

    def get_recommended(self, hxs):
        """Gets recommended product information.
        Returns information about recommended products as dict"""
        rec = hxs.select('//div[@id="right_column_container"]/div')
        new = []
        i = 0
        for r in rec:
            d = {}
            # TODO: see how to get the full href (different accounts)
            if not i:
                d['link'] = r.select('div/a/@href').extract()[0]
                d['image'] = "http://www.partylite.biz/imaging/resize"
                d['image'] += r.select('div/a/img/@src').extract()[0]
                d['name'] = r.select('div/a/text()').extract()[0]
                new.append(basic.cdata(simplejson.dumps(d)))
            i += 1
        return new

    def get_reviews(self, page):
        """Gets average product rating.
        Returns string like 4.6 of 5 reviews."""
        id = self.get_review_id(page)
        url = "http://partylite.ugc.bazaarvoice.com/8504-en_us/" + id + "/reviews.djs?format=embeddedhtml"
        url = url.replace(" ", "")
        page = urllib2.urlopen(url).read()
        page = basic.get_middle_text(page, '<div class=\\"BVRRRatingNormalImage\\">', '<\/div>')
        if page:
            rating = basic.get_middle_text(page[0], 'alt=\\"', '\\"')[0]
            return [rating]
        else:
            return []

    def get_more_images(self, page):
        """Gets field of images."""
        try:
            script = basic.get_middle_text(page, "var moreImages", "var numberOfImages")[0]
        except IndexError:
            print "This product has no images."
        else:
            r = basic.get_middle_text(script, "moreImages[", "';")
            images = []
            # return cdata here if needed to go with absolute links
            for i in range(0, len(r)):
                if self.production:
                    images.append("http://www.partylite.biz" + r[i].split("= '")[1])
                else:
                    images.append("http://qa.partylite.biz" + r[i].split("= '")[1])
            return images
        return []

    def get_absolute(self, relatives):
        """Creates absolute path for images. [DEPRECATED]
        Please check if there is a need for this function again.
        If needed dimensions of images got from the client server
        can be changed here."""
        new = []
        print relatives
        os._exit(0)
        for i in range(0, len(relatives)):
            #add width, height here for different dimensions
            #don't change the url in here from qa to www it's meant to be qa always
            new.append("http://www.partylite.biz/imaging/resize?fileName=/productcatalog/production" + relatives[i])
        return new

    def get_review_id(self, page):
        """Gets review id that is used in javascript for reviews."""
        return basic.get_middle_text(page, 'productId: "', '"')[0]

    def write_subproducts(self, id, list, xml):
        """Writes child products to xml.
        Receives id, list and xml arguments: id is the master product id,
        list is the list of child products and xml is an Xml instance."""
        for i in range(0, len(list)):
            item = PartyliteItem()
            item['master_product_id'] = id
            item['product_id'] = [id[0] + "_" + str(i)]
            item['in_stock'] = ["IN_STOCK"]
            for k, v in list[i].iteritems():
                item[k] = [v]
            xml.create_xml(item)
        return 1

    def parse_can(self, response):
        """Parse function for scraping canadian sites.
        There is meta information send in request in this function about language."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        item = PartyliteItem()
        hxs = HtmlXPathSelector(response)
        image_urls = []
        if 'redirect_urls' in response.request.meta:
            item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0]]
            self.exc.code_handler(102, response.request.meta['redirect_urls'])
            if 'language' in response.request.meta:
                item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0]
                                      + "_can" + "_" + response.meta['language']]
            try:
                index = self.products['product_ids'].index(
                    self.get_id(response.request.meta['redirect_urls'][0])[0])
                item['name'] = [basic.cdata(item['product_id'][0]
                                + self.products['names'][index])]
                self.products['status'][index] = 'no_avail'
            except ValueError:
                print "This id %s is not in the list" % (item['product_id'][0])
            item['in_stock'] = ['NOT_AVAILABLE']
            item['product_id'] = self.remove_spaces(item['product_id'])
            self.xml.create_xml(item)
        else:
            index = self.products['product_ids'].index(self.get_id(response.url)[0])
            try:
                item['product_id'] = self.get_id(response.url)
                item['name'], item['shown_with'] = self.get_basic_info(hxs)
                item['description'] = self.get_description(hxs)
                if 'language' in response.meta:
                    item['product_id'] = [item['product_id'][0] + "_can" + "_" + response.meta['language']]
                response.meta['item'] = item
                page = " ".join(hxs.select('//html').extract())
                image_urls = self.get_more_images(page)
                item['normal_image_url'] = self.get_server_path_field(image_urls)
                item['in_stock'] = self.get_in_stock(hxs)
                color_products = self.create_subproducts(page)
                if color_products:
                    self.write_subproducts(item['product_id'], color_products, self.xml)
                else:
                    item['add_to_cart_id'] = self.get_add_to_cart_id(page)
                    item['custom_price'], item['custom_discount'] = self.get_price(hxs)
                self.products['status'][index] = "ran"
            except StandardError:
                basic.print_error()
                self.products['status'][index] = "error"
                self.exc.code_handler(100, response.url)
            else:
                item['product_id'] = self.remove_spaces(item['product_id'])
                self.xml.create_xml(item)
        if image_urls:
            item['image_urls'] = image_urls
        return item

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = party.get_settings_message(self.d)
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        logname = filename
        filename = "{0}_{1}".format(filename, self.d['lang'])
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        from modules.mail import Mail
        from modules.export_to_db import CommonExport
        exp = CommonExport()
        if self.upload:
            try:
                if self.d['lang'] == 'us':
                    exp.xml_to_db(self.name, filename, "55892247-1b92-4ff9-a8a3-33cc976f9341")
                else:
                    exp.xml_to_db(self.name, filename, "9cb6c676-c14f-403b-b94f-b981184e1de0")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        mail = Mail()
        try:
            mail.send_mail(msg, "Partylite: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Partylite: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, logname), 'w') as f:
                f.write(msg)

    def get_id(self, url):
        """Gets id from product url."""
        return [url.split("&sku=")[1]]

    def get_server_path(self, url):
        """Gets server path for image url."""
        url = url.split("partylite.biz")[1]
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_server_path_field(self, urls):
        """Getting server path for field of image urls."""
        new = []
        for url in urls:
            url = url.split("partylite.biz")[1]
            new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    def remove_spaces(self, field):
        new = []
        for i in field:
            new.append(i.replace(' ', ''))
        return new

    def get_lists_from_excel(self):
        excel_path = "xls/{0}/{1}.xls".format(self.name, self.d['file'])
        xls = PartyliteExcel(path=excel_path, user=self.users['us'], production=self.production)
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
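
The get_server_path helpers in these spiders mirror how Scrapy's images
pipeline names downloaded files: each image is stored inside IMAGES_STORE
under full/<sha1 of the image URL>.jpg, so hashing the same URL string
reproduces the path the downloaded file will land at. For example (the URL
is hypothetical):

    import hashlib

    url = "http://www.example.com/images/product.jpg"
    print "/images/full/" + hashlib.sha1(url).hexdigest() + ".jpg"
    # /images/full/<40 hex digits>.jpg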
Example No. 8
class LydiasSpider(CrawlSpider):
    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(LydiasSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        # fix for bug with links they provide
        self.products['urls'] = basic.cut_string_field(self.products['urls'],
                                                       "&cat=")
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        id = self.products['product_ids'][index]
        try:
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [id]
                (item['name'], item['price'], item['old_price'],
                 item['description']) = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = self.get_image_swatches(
                    hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(
                    color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(
                    default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                self.create_subproducts(id, color_names, product_image,
                                        color_codes, hxs)
                item['image_urls'] = (product_image + color_urls + chart +
                                      default_images)
                self.products['status'][index] = "ran"
            else:
                self.exc.code_handler(102, response.url)
                item['product_id'] = [id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    # function for checking if product has embroidery or not
    def get_embroidery(self, hxs):
        page = hxs.select('//html').extract()[0]
        if "document.getElementById('logocolor').disabled = true;" in page:
            return ["True"]
        else:
            return ["False"]

    # function for creating json with all information for colors
    def make_colors_json(self, color_urls, color_names, color_codes):
        dict = {}
        jsons = []
        for i in range(0, len(color_urls)):
            dict['color_url'] = self.get_server_path_single(color_urls[i])
            dict['color_name'] = color_names[i]
            dict['color_short'] = color_codes[i]
            json = basic.cdata(simplejson.dumps(dict))
            jsons.append(json)
        return jsons

    # function for getting image server path
    def get_server_path_single(self, url):
        #        return url
        return self.images_store + "/full/" + hashlib.sha1(
            url).hexdigest() + ".jpg"

    # function for getting image path for field of images
    def get_server_path(self, urls):
        #        return urls
        new = []
        for url in urls:
            new.append(self.images_store + "/full/" +
                       hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    #function for getting basic information for product
    def get_basic_info(self, hxs):
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select(
            '//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()'
        ).extract()
        description = basic.cdata(
            hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select(
            '//span[@class="yourprice_product"]/text()').extract()
        if not price:
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    # function for getting rating, both number and sentence (e.g. Rating 5 out of 6 votes)
    def get_rating(self, hxs):
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if temp:
            rating = basic.get_middle_text(temp[0].replace(" ", ""), "Rating:",
                                           "out")
            return rating, temp
        else:
            return [], temp

    # function for getting reviews, returning rating and a field of json reviews,
    # or empty fields if there are no reviews
    def get_reviews(self, hxs):
        reviews = hxs.select('//div[@class="prodReview"]')
        if reviews:
            title = reviews[0].select(
                'p[@class="review_title"]/text()').extract()
            text = reviews[0].select(
                'p[@class="review_text"]/text()').extract()
            author = reviews[0].select(
                'p[@class="review_author"]/text()').extract()
            location = reviews[0].select(
                'p[@class="review_location"]/text()').extract()
            jsons = self.make_reviews_json(title, text, author, location)
            return jsons
        else:
            return []

    # function for making json for reviews
    # currently not in use because there are no reviews in the DPW design
    def make_reviews_json(self, title, text, author, location):
        jsons = []
        # debug guard left in while the function is unused: print the field
        # lengths and bail out before building any json
        print len(title)
        print len(text)
        print len(author)
        print len(location)
        os._exit(0)
        for i in range(0, len(title)):
            json = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" :\
                    "%s" }' % (title[i], text[i], author[i], location[i])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    #function for getting size chart image
    def get_size_image(self, hxs):
        temp = hxs.select(
            '//div[@class="TabbedPanelsContent cells"]/img/@src').extract()
        return temp

    #function for getting image swatches, returning fields (image_urls, image name, product color image)
    def get_image_swatches(self, hxs):
        colors = hxs.select('//div[@class="lolite"]')
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in colors:
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            #if zoom image needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            color_codes.append(
                color.select('a/@onclick').extract()[0].split(",")[1].replace(
                    "'", ""))
        return color_images, color_names, products_image, color_codes

    # function for getting additional images; returns a field of images, or an empty field if there are none
    def get_extra_images(self, hxs):
        additional_images = hxs.select(
            '//div[@id="AddImg"]/script/text()').extract()
        if additional_images:
            temp = basic.get_middle_text(additional_images[0], '"', '"')
            thumb_images = temp[0].split(",")
            return thumb_images
        else:
            return []

    #function for getting product id from the page
    def get_product_id(self, hxs):
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        id = basic.get_middle_text(temp[0], 'productid","', '"')
        return id[0]

    # function for getting sizes from another url, returning a field of jsons for sizes
    # one id from the page is 115NB, if it is needed here to hardcode for testing
    # currently not in use
    def get_sizes(self, id, hxs):
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (
            id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (
            showmode, itemmode, salemode)
        jsons = []
        print "reading page..."
        page = urllib2.urlopen(url).read()
        print "page read"
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
                    "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2],
                                                 tmp[3])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    # function that handles creating subproducts; can be adapted to the usual approach of one product for every
    # combination of size and color if needed
    def create_subproducts(self, id, color_names, product_image, color_codes,
                           hxs):
        item = LydiasItem()
        # if there are no colors for a specific product, do this part and call size-children creation with an empty
        # string instead of the actual color name
        if len(color_names) == 0:
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(
                id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)

        # for handling cases when there are color options for a specific product: create a child for every color, and
        # call size-children creation for every provided color
        else:
            for i in range(0, len(color_names)):
                print "name :" + color_names[i] + "  code:" + color_codes[i]
                item['master_product_id'] = [id]
                item['product_id'] = [id + "_" + str(i)]
                item['color'] = [color_names[i]]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path(
                    [product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(
                    id, id + "_" + str(i), color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    # function for creating child products for sizes
    # a little messy with all the commented lines, but those lines can be used if needed to go back to the old way
    # with child products instead of json
    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        print color_code
        jsons = []
        # if block for cases when color is provided
        if color_code != "":
            showmode = hxs.select(
                '//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select(
                '//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select(
                '//input[@name="salemode"]/@value').extract()[0]
            url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
                "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
            page = urllib2.urlopen(url).read()
            page = page.replace("'", "")
            page = page.replace("[", ",")
            page = page.replace(",,", "")
            temp = page.split("]")
            for i in range(0, len(temp) - 2):
                tmp = temp[i].split(",")
                item = {}
                #                item['master_product_id'] = [id]
                item['size_short'] = tmp[0]
                item['price_url'] = self.get_size_price(
                    str(main_id), str(color_code), tmp[0])
                item['size'] = tmp[1]
                #                item['product_id'] = [id + "_" + str(i)]
                #                item['in_stock'] = ["IN_STOCK"]
                #                xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons

        # when the color is not provided, a different block of code is used because it's done differently on the page
        else:
            temp = hxs.select('//div[@class="not_size"]/text()').extract()
            for i in range(0, len(temp)):
                item = {}
                #                item['master_product_id'] = [id]
                #                item['product_id'] = [id + "_" + str(i)]
                item['size_short'] = temp[i]
                item['price_url'] = self.get_size_price(
                    str(main_id), "", temp[i])
                #                item['in_stock'] = ["IN_STOCK"]
                #                xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons


    # function for getting the price for every combination of size and color; returns the url where the price is, or
    # it could parse that url to get the actual price, but that would drastically increase scraping time

    def get_size_price(self, id, color, size):
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        url = url.replace(" ", "%20")
        return url
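
    # A minimal sketch of the optional second step described above: fetching
    # the price url and pulling a number out of the response. The regex is an
    # assumption about the payload, not the site's confirmed format; the
    # method name is hypothetical.
    def fetch_size_price(self, url):
        import urllib2
        import re
        page = urllib2.urlopen(url).read()
        match = re.search("[0-9]+(?:[.,][0-9]{2})?", page)
        return match.group(0) if match else None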

    # helper for turning relative paths from the page into absolute urls
    def absolute_path(self, urls):
        new = []
        for i in urls:
            new.append("http://www.lydiasuniforms.com" + i)
        return new

    # function used for getting embroidery information from the client's page; was used only once
    # because embroidery is the same for all the products
    def get_emb(self, hxs):
        emb = hxs.select('//div[@id="emb"]').extract()
        lettering_colors = hxs.select(
            '//select[@id="threadcolor"]/option/@value').extract()
        urls = []
        d = {}
        colors = []
        for i in range(1, len(lettering_colors)):
            d['type'] = "lettering colors"
            d['name'] = lettering_colors[i]
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
            d['url'] = self.get_server_path_single(url)

            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(d)))
        lettering = hxs.select(
            '//select[@id="lettering"]/option/@value').extract()
        l = {}
        letterings = []
        for i in range(1, len(lettering)):
            l['type'] = "lettering"
            l['name'] = lettering[i]
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering[i].lower().replace(' ', '_') + ".gif"
            l['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(l)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        logos = {}
        log = []
        for i in range(1, len(logo)):
            logos['type'] = "logo"
            logos['name'] = logo[i]
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo[i].lower() + ".jpg"
            logos['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(logos)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        self.xml.create_xml(item)
        self.xml.write_xml("emb")

        return urls

    def handle_not_provided(self):
        item = LydiasItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
        #exp = CommonExport()
        #try:
        #exp.xml_to_db(self.name, filename, "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9")
        #msg += "\n\nExport to database successful"
        #except StandardError:
        #msg += "\n\nExport to database failed"
        #else:
        #msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)
Ejemplo n.º 9
0
class ChomeSpider(CrawlSpider):
    name = "chome"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com/"]
    counter = 0

    def __init__(self, *a, **kw):
        super(ChomeSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.no_urls['product_ids'])

    def parse(self, response):
        self.counter += 1
        hxs = HtmlXPathSelector(response)
        item = ChomeItem()
        print "IDs in excel feed: {0}".format(self.total)
        item['image_urls'] = self.parse_whole_xml()
        return item

    def parse_whole_xml(self):
        xml_dir = "xml/{0}".format(self.name)
        file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
        downloader = Downloader()
        if self.d['download']:
            downloader.get_file(xml_dir, file_url, "client_feed")
        else:
            if not os.path.exists('xml/{0}/client_feed.xml'.format(self.name)):
                basic.warning(
                    "Feed file doesn't exist, please de-select the no-download option"
                )
                os._exit(2)
        self.number = 0
        xml_item = ChomeItem()
        urls_all = []
        for event, elem in iterparse('xml/{0}/client_feed.xml'.format(
                self.name)):
            if elem.tag == "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties":
                for r in elem:
                    p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"
                    if r.tag == p + "Id" and r.text in self.no_urls[
                            'product_ids']:
                        index = self.no_urls['product_ids'].index(r.text)
                        self.no_urls['status'][index] = 'ran'
                        self.number += 1
                        urls = []
                        flag = 0
                        for x in elem:
                            if x.tag == p + "Id":
                                xml_item['product_id'] = [x.text]
                            elif x.tag == p + "EngLongDesc" and x.text is not None:
                                xml_item['description_english'] = [
                                    self.escape(basic.cdata(x.text))
                                ]
                            elif x.tag == p + "RetailPrice":
                                xml_item['custom_price'] = [x.text[:-2]]
                            elif x.tag == p + "SpnLongDesc" and x.text is not None:
                                xml_item['description_spanish'] = [
                                    self.escape(basic.cdata(x.text))
                                ]
                            elif x.tag == p + "PartNumber":
                                xml_item['add_to_cart_id'] = [x.text]
                            elif x.tag == p + "MaxQty":
                                xml_item['max_qty'] = [x.text]
                            elif x.tag == p + "TimeType":
                                xml_item['time_type'] = [x.text]
                            elif x.tag == p + "SpnName" and x.text is not None:
                                xml_item['name_spanish'] = [x.text]
                            elif x.tag == p + "EngName":
                                xml_item['name_english'] = [x.text]
                            elif x.tag == p + "ImagePath_Large" and x.text is not None:
                                urls.append(self.get_absolute(x.text))
                                xml_item['normal_image_url'] = [
                                    self.get_server_path(
                                        self.get_absolute(x.text))
                                ]
                            elif x.tag == p + "IsActive":
                                if x.text == "0":
                                    xml_item['in_stock'] = ["NOT_IN_STOCK"]
                                else:
                                    xml_item['in_stock'] = ['IN_STOCK']
                            else:
                                for i in range(1, 4):
                                    tag = p + "Alternate%sImagePath_Large" % (
                                        str(i))
                                    if x.tag == tag and x.text is not None:
                                        urls.append(self.get_absolute(x.text))
                                        xml_item['normal_image_url'].append(
                                            self.get_server_path(
                                                self.get_absolute(x.text)))
                                        # change image paths for normal_image_url and return urls
                        self.xml.create_xml(xml_item)
                        urls_all += urls
        for i in range(0, len(self.no_urls['status'])):
            if self.no_urls['status'][i] != 'ran':
                self.no_urls['status'][i] = 'not_found'
        return urls_all
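
    # iterparse keeps every parsed element in memory unless it is cleared,
    # which adds up on a large client_feed.xml. A minimal sketch of the same
    # loop shape with elem.clear() after each properties element (assuming
    # nothing later needs the parsed tree); the method name is hypothetical.
    def parse_feed_low_memory(self, path):
        from xml.etree.cElementTree import iterparse
        meta = "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}"
        for event, elem in iterparse(path):
            if elem.tag == meta + "properties":
                # ... handle the element exactly as in parse_whole_xml ...
                elem.clear()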

    def get_server_path(self, url):
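        # note: this mirrors scrapy's images pipeline naming, which stores
        # each download as <IMAGES_STORE>/full/<sha1 of the url>.jpg, so the
        # local path can be predicted before the image is actually fetched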
        path = self.images_store + "/full/" + hashlib.sha1(
            url).hexdigest() + ".jpg"
        return path

    def get_absolute(self, url):
        return "http://www.celebratinghome.com/" + url

    def escape(self, string):
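        # unescape twice so doubly-encoded entities come out clean,
        # e.g. (assumed example) "Pots &amp;amp; Pans" -> "Pots & Pans"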
        temp = HTMLParser.HTMLParser().unescape(string)
        return HTMLParser.HTMLParser().unescape(temp)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}\n".format(datetime.now())
        if self.total - self.number:
            msg += "{0} id(s) from id list weren't found in feed".format(
                self.total - self.number)
            basic.warning(msg)
        else:
            msg += "All ids found in feed."
            basic.green(msg)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.no_urls)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
        #exp = CommonExport()
        #try:
        #exp.xml_to_db(self.name, self.d['file'], "40b029c9-dff7-4bc1-b8bc-ef062960b24d")
        #msg += "\n\nExport to database successful"
        #except StandardError:
        #msg += "\n\nExport to database failed"
        #else:
        #msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "CelebratingHome: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "CelebratingHome: {0}".format(filename),
                               self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        xml.add_property("description_english", "Description English", "text")
        xml.add_property("description_spanish", "Description Spanish", "text")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("max_qty", "Max Quantity", "text")
        xml.add_property("time_type", "Time Type", "text")
        xml.add_property("name_english", "Name English", "text")
        xml.add_property("name_spanish", "Name Spanish", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("custom_price", "Custom Price", "text")
Ejemplo n.º 10
0
class BurtonSpider(CrawlSpider):
    name = "burton"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'],
                                                                        self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        self.start_urls = self.products['urls']
        # leftover test url; uncomment to run against a single product:
        # self.start_urls = ["http://www.dickssportinggoods.com/product/index.jsp?productId=13243074"]
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        page = hxs.extract()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if 'redirect_urls' in response.request.meta:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def handle_not_provided(self):
        item = BurtonItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        name = hxs.select('//h1[@class="productHeading"]/text()').extract()
        product_id = hxs.select('//input[@name="productId"]/@value').extract()
        return product_id, name

    def get_server_path(self, url):
        path = self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"
        return path

    def get_prices(self, hxs):
        price = hxs.select('//div[@class="op"]/text()').extract()
        price = [basic.get_price(price[0])]
        old_price = hxs.select('//span[@class="lp"]/text()').extract()
        if old_price:
            old_price = [basic.get_price(old_price[0])]
        return price, old_price

    def get_description(self, hxs):
        description = hxs.select('//div[@id="FieldsetProductInfo"]/text()').extract()[3]
        features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if features:
            features = [features[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(features)

    def get_variants(self, page):
        """Gets jsons for colors with all available sizes.
        In json are also fetched all information for sizes that are on the site
        """
        script = basic.get_middle_text(page, 'var skuSizeColorObj = new Array();', '</script>')[0]
        sizes = []
        image_urls = []
        color_names = []
        colors = script.split('skuSizeColorObj')
        for c in range(1, len(colors)):
            temp = basic.get_middle_text(colors[c], '= ', ';')
            # delete swatch image as it obviously won't be needed
            t = simplejson.loads(burton.replace_for_json(temp[0]))
            image_urls.append(t['swatchURL'])
            color_names.append(t['ColorDesc'])
            t['swatchURL'] = self.get_server_path(t['swatchURL'])
            sizes.append(basic.cdata(simplejson.dumps(t)))
        return sizes, image_urls, color_names

    def get_all_sizes(self, page):
        script = basic.get_middle_text(page, 'var distsizeobj=new Array();', 'var indexcolor=0;')[0]
        all_sizes = basic.get_middle_text(script, ']="','";')
        return [basic.cdata(simplejson.dumps(all_sizes))]

    def get_colors(self, page, color_names):
        """Gets color information with images from javascript on the page.
        Returns  json with color name and imagself.images_store = "/" + settings['IMAGES_STORE']e url for that color, and
        returnes filed of image urls that can be used for download later"""
        script = basic.get_middle_text(page, 'var imageMap_0 = new Array();', '</script>')[0]
        colors = basic.get_middle_text(script, '] = ', ';')
        image_urls = []
        colors_json = []
        for i in range(0, len(color_names)):
            color = burton.replace_color_json(colors[i])
            color = simplejson.loads(color)
            color['cname'] = color_names[i]
            color.pop('reg')
            image_urls.append(color['enh'])
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
Ejemplo n.º 11
0
class KennethSpider(CrawlSpider):
    name = "kenneth"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(KennethSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.images_store = "/" + settings['IMAGES_STORE'] + "/"
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        print self.d
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'],
                                                                        self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.no_url_products(self.no_urls)
        self.start_urls = self.products['urls'] 
        self.total = len(self.start_urls)

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = KennethItem()
        # main try for the script; run the general except if an error
        # happens in the code (send the url where it happened by mail)
        try:
            cur_url = response.url
            # search for the noResultsContent div on the page; if it exists,
            # keep track that the product doesn't exist on their page,
            # otherwise continue scraping the page
            available = hxs.select('//div[@id="noResultsContent"]').extract()

            if not available:
                index = self.products['urls'].index(cur_url)
                cur_id = self.get_product_id(cur_url)
                id = self.products['product_ids'][index]
                page = hxs.select('//div[@id="mainContent"]').extract()
                page = " ".join(page)
                item['name'], item['description'] = self.get_basic_info(hxs)
                price, new_p, old_p = self.get_prices(hxs)
                if new_p:
                    item['new_price'] = new_p
                    item['old_price'] = old_p
                else:
                    item['price'] = price
                desc = basic.clean_string(item['description'][0])
                item['description'] = [desc]
                urls = self.get_color_image(hxs)
                new = self.get_image_server_path(urls, id)
                item['color_image_urls'] = new
                self.export(item['color_image_urls'], [id], "swatchImage")
                jsons, images = self.we_also_recommend(cur_id, id)
                item['product_page'] = [cur_url]
                item['product_id'] = [id]
                item['add_to_cart_id'] = [cur_id]
                item['recommended_product'] = jsons
                item['in_stock'] = ["IN_STOCK"]
                self.products['status'][index] = "ran"
                images_or_404 = self.get_colors(hxs, page, id)
                if images_or_404 == 404:
                    item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                item['image_urls'] = []
                if images_or_404 != 404:
                    item['image_urls'] += images_or_404
                item['image_urls'] += urls
                item['image_urls'] += images
                #self.export(item['image_urls'])
                #item['image_urls'] = [] #uncomment for downloading images

            else:
                # part for handling products that are not available
                cur_id = self.get_product_id(cur_url)
                cur_url = "http://www.kennethcole.com/product/index.jsp?"
                cur_url += "productId=" + str(cur_id)
                index = self.products['urls'].index(cur_url)
                self.products['status'][index] = "no_avail"
                item['product_id'] = [self.products['product_ids'][index]]
                if self.products['product_ids'][index]:
                    item['name'] = [self.products['names'][index]]
                else:
                    item['name'] = ["not available"]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                self.exc.code_handler(102, cur_url)
        except:
            # part for catching errors and keeping track of numbers of
            # it and urls where it happened
            print "Error occured scraping this product"
            index = self.products['urls'].index(cur_url)
            self.products['status'][index] = "error"
            self.exc.code_handler(100, cur_url)
        return item

    def no_url_products(self, no_url):
        item = KennethItem()
        for n in no_url['product_ids']:
            item['product_id'] = [n]
            index = no_url['product_ids'].index(n)
            item['name'] = [no_url['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    #function for getting basic product info from the page
    def get_basic_info(self, hxs):
        name = hxs.select('//div[@id="productInfoTop"]/h1/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="productDescription"]').extract()[0])
        return name, [description]

    # function for getting prices from the page, only one, or the new and
    # old one if that's the case
    def get_prices(self, hxs):
        price = hxs.select('//div[@id="productInfoTop"]/h2/text()').extract()[0]
        new_p = hxs.select('//h2[@class="sale-now"]/text()').extract()
        old_p = hxs.select('//span[@class="productGrey"]/text()').extract()
        price = re.sub('[^0-9.,]', '', price)
        return [price], new_p, old_p

    def get_color_image(self, hxs):
        return hxs.select('//div[@id="productInfoR2W"]/img/@src').extract()

    # function for getting colors from javascript on the page and writing
    # them in xml; from here the function for creating further
    # sizes subproducts is called
    def get_colors(self, hxs, page, main_id):
        item = KennethItem()
        try:
            tmp = page.split('displays[0]')[1]
        except IndexError:
            print "This product is not available"
            return 404
        script = tmp.split('</script>')[0]
        displays = script.split("};")
        global counter
        ids = []
        images = []
        color_ids = []
        sizes_script = self.get_sizes_part_page(page)
        color_internal_code = {}

        for x in range(0, len(displays) - 1):
            id = basic.get_middle_text(displays[x], 'colorId: "', '"')
            ids.append(id[0])
            reg = displays[x].count("Reg")
            images_in = []
            for i in range(1, reg + 1):
                image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg: "', '"')
                if len(image) == 0:
                    image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg:"', '"')
                if (len(image) > 0):
                    if (image[0] != "null"):
                        images_in.append(image[0])

            if not images_in:
                images_in = hxs.select('//input[@name="productImage"]/@value').extract()
            color_ids.append(str(main_id) + "_" + str(x))
            item['product_id'] = [str(main_id) + "_" + str(x)]
            item['color_option_id'] = id
            item['master_product_id'] = [main_id]
            item['normal_image_url'] = self.get_image_server_path(images_in, main_id)
            item['thumb_image_url'] = self.get_image_server_path_thumb(images_in, main_id)
            item['in_stock'] = ["NOT_IN_STOCK"]
            item['color'] = self.get_color_name(sizes_script, id[0])
            color_internal_code[id[0]] = str(x)
            self.xml.create_xml(item)
            images += images_in
            self.export(item['normal_image_url'], item['product_id'], "productImage")
        self.get_sizes(sizes_script, ids, main_id, color_internal_code)
        return images

    # function for getting sizes for products from javascript, and storing 
    # information in dicts of format {id : information}
    def get_sizes(self, page, ids, main_id, color_internal_code):
        options = page.split("};")
        skus = {}
        colors_name = {}
        inStocks = {}
        sizes = {}
        prices = {}
        for x in range(0, len(options) - 1):
            id = basic.get_middle_text(options[x], 'cId: "', '"')
            for i in range(0, len(ids)):
                if (id[0] == ids[i]):
                    sku = basic.get_middle_text(options[x], 'sku: ', ',s')
                    sku = re.sub("[^0-9]", "", sku[0])
                    skus = self.add_to_dict(skus, ids[i], sku)
                    size = basic.get_middle_text(options[x], 'sDesc: "', '"')
                    sizes = self.add_to_dict(sizes, ids[i], size[0])
                    price = basic.get_middle_text(options[x], 'price: "', '"')
                    price = self.clean_price(price[0])
                    prices = self.add_to_dict(prices, ids[i], price[0])
                    available = basic.get_middle_text(options[x], 'avail: "', '"')
                    inStocks = self.add_to_dict(inStocks, ids[i], available[0])
        self.create_subproducts_xml(main_id, color_internal_code, colors_name, sizes, skus, inStocks, prices)
        return main_id, colors_name, sizes, skus, inStocks, prices

    # function for creating subproducts for every size
    def create_subproducts_xml(self, main_id, color_internal_code, colors_name, sizes, skus, inStocks, prices):
        number = 0
        global counter
        for k, v in sizes.iteritems():
            item = KennethItem()
            for i in range(0, len(v)):
                item['size'] = [v[i]]
                item['size_option_id'] = [skus[k][i]]
                m_id = main_id + "_" + color_internal_code[k]
                item['master_product_id'] = [m_id]
                id = m_id + "_" + str(i)
                item['product_id'] = [id]
                if inStocks[k][i] == "NOT_AVAILABLE":
                    item['in_stock'] = ["NOT_IN_STOCK"]
                elif inStocks[k][i] == "ADVANCED_SALE_LIMITED":
                    item['in_stock'] = ["IN_STOCK"]
                else:
                    item['in_stock'] = [inStocks[k][i]]
                item['price'] = [prices[k][i]]
                #item['color'] = colors_name[k]
                self.xml.create_xml(item)
            number += 1

    def add_to_dict(self, dict, index, value):
        try:
            dict[index].append(value)
        except:
            dict[index] = [value]
        return dict
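
    # a minimal equivalent of add_to_dict using the standard library;
    # collections.defaultdict does the same grouping without the try/except
    # (hypothetical helper, not called by the spider):
    def add_to_dict_default(self, d, index, value):
        from collections import defaultdict
        if not isinstance(d, defaultdict):
            d = defaultdict(list, d)
        d[index].append(value)
        return d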

    # function for getting we also recommend information about products from
    # their page, returns json list with information and images
    # list with images urls
    def we_also_recommend(self, id, main_id):
        url = "http://www.res-x.com/ws/r2/Resonance.aspx?appid=kennethcole01&t"
        url += "k=154212870918247&ss=525178103419747&sg=1&pg=897706724574618&b"
        url += "x=true&vr=2.67&sc=product_rr&ev=product&ei=" + id + "&cu=&ct=k"
        url += "ennethcolec01&no=3&cb=r1eh&clk=&cv1=" + id + "&cv23=63&ur=http%"
        url += "3A//www.kennethcole.com/product/index.jsp%3FproductId%3D3" + id
        url += "&plk=&rf="
        import urllib2
        page = urllib2.urlopen(url).read()
        temp = page.split("certonaRecBoxes")
        images = []
        ids = []
        names = []
        prices = []
        urls = []
        # parsing data got from the upper url about we also recommend products
        for i in range(1, len(temp)):
            id = [basic.get_middle_text(temp[i], "d=", '\\"')[0]]
            image = basic.get_middle_text(temp[i], 'src=\\"', '\\"')[0]
            name = basic.get_middle_text(temp[i], 'alt=\\"', '\\"')
            price = basic.get_middle_text(temp[i], '<br>', '</a>')
            url = "http://www.kennethcole.com/product/index.jsp?productId="
            url += id[0]
            urls.append(url)
            ids.append(id)
            names.append(name)
            prices.append(price)
            images.append(image)
        jsons = self.make_json(ids, names, prices, self.get_image_server_path(images, main_id), urls)
        return jsons, images

    # function for getting product id from the url
    def get_product_id(self, url):
        return url.split("=")[1]

    #function for making json
    def make_json(self, ids, names, prices, images, urls):
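        # caveat: hand-built json breaks on embedded quotes in names or
        # prices; simplejson.dumps on a dict, wrapped in basic.cdata, would
        # be the safer route (same issue as make_reviews_json earlier)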
        jsons = []
        for i in range(0, len(ids)):
            json = "{" + ' "id" : "' + str(ids[i][0]) + '", '
            json += '"name" : "' + str(names[i][0]) + '", '
            # insert function for storing the right image path
            json += '"image_url" : "' + str(images[i]) + '", '
            json += '"product_url" : "' + urls[i] + '", '
            json += '"price" : "' + str(prices[i][0]) + '" } '
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    #function for getting javascript where sizes are handled
    def get_sizes_part_page(self, page):
        tmp = page.split("availDates = new Array();")[1]
        script = tmp.split("</script>")[0]
        return script

    # function for getting name of the color by id
    def get_color_name(self, script, id):
        temp = script.split(id)
        temp = temp[0].split('cDesc: "')
        temp = temp[len(temp) - 1]
        name = temp.split('"')[0]
        return [name]

    # function for exporting images to database via rest
    def export(self, images, id, tags):
        # set override to 0 to upload images; otherwise uploading is skipped
        override = 1
        if override == 0:
            import MultipartPostHandler
            import urllib2
            import os
            url = 'http://api.admin.zmags.com/productImage/import?key=5ef90922-283b-4412-a1c8-3e70bc28b9d3'

            for i in range(0, len(images)):
                image_name = self.get_image_name(images[i])
                path = "images/kenneth_images/small/" + str(image_name)
                params = {'file': file(path, 'rb'), 'product_id': id[0],
                          'index': str(i + 1), 'tags': tags}
                          #token not working
                opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
                code = opener.open(url, params).getcode()

                if (code != 202):
                    print ("Achtung")
                global images_number
                images_number += 1
                print images_number

                print "Image uploaded to product " + id[0]
        else:
            #print "Image upload overriden.."
            pass

    # function for getting image paths on our server
    def get_image_server_path(self, urls, id):
        new = []
        for url in urls:
            temp = url.split("/")
            new.append(self.images_store + id + "/full/" + temp[len(temp) - 1])
        return new

    # function for getting thumb image paths on our server
    def get_image_server_path_thumb(self, urls, id):
        new = []
        for url in urls:
            temp = url.split("/")
            new.append(self.images_store + id + "/small/" + temp[len(temp) - 1])
        return new

    def clean_price(self, price):
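        # strips everything but digits, dots and commas,
        # e.g. (assumed example) self.clean_price("Kr 1.299,00") -> ["1.299,00"]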
        return [re.sub('[^0-9.,]', '', price)]

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped {0} product out of {1}\n\n".format(self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "29eac9ea-8c57-4d22-baf4-3f1471dc3ab6")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "KennethCole: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "KennethCole: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(2, 2)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(0, 2)
            self.products['names'] = xls.read_excel_collumn(1, 2)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        xml.add_property("add_to_cart_id", "Add To Cart Id", "text")
        xml.add_property("product_page", "Product page", "text")
        xml.add_property("color_image_urls", "Color Image URLs", "text_list")
        xml.add_property("color_option_id", "Color Option ID", "text")
        xml.add_property("recommended_product", "Recommended Product", "text_list")
        xml.add_property("size_option_id", "Size Option ID", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("new_price", "New Price", "text")
Ejemplo n.º 12
0
class SportmanSpider(CrawlSpider):
    name = "sportman"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(SportmanSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")

        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if 'redirect_urls' in response.request.meta:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item["name"], item["short_desc"], item["description"], item[
                    "old_price"], item["custom_price"], item[
                        "product_id"], item["sku"] = self.get_basic_info(hxs)
                item['in_stock'] = ['IN_STOCK']
                viewstate, eventval, prevpage, hidden, view_page, even_page, pre_page, hidd_page = self.get_vars(
                    response, hxs)

                viewstate1 = viewstate[:2000]
                viewstate2 = viewstate[2000:4000]
                viewstate3 = viewstate[4000:6000]
                viewstate4 = viewstate[6000:8000]
                viewstate5 = viewstate[8000:10000]
                viewstate6 = viewstate[10000:]

                item["viewstate1"] = [basic.cdata(viewstate1)]
                item["viewstate2"] = [basic.cdata(viewstate2)]
                item["viewstate3"] = [basic.cdata(viewstate3)]
                item["viewstate4"] = [basic.cdata(viewstate4)]
                item["viewstate5"] = [basic.cdata(viewstate5)]
                item["viewstate6"] = [basic.cdata(viewstate6)]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)

                images_url = self.get_images(hxs)

                item["normal_image_url"] = self.get_server_path(images_url)

                self.xml.create_xml(item)
                item.clear()
                item['image_urls'] = self.get_images(hxs)
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item
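
    # the fixed 2000-character viewstate slices above can be produced with a
    # single loop; a minimal sketch (hypothetical helper, matching the
    # viewstate1..viewstate6 fields built in parse(), where the last slice
    # takes the whole remainder):
    def split_viewstate(self, viewstate, size=2000, parts=6):
        chunks = [viewstate[i * size:(i + 1) * size]
                  for i in range(parts - 1)]
        chunks.append(viewstate[(parts - 1) * size:])
        return chunks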

    def get_basic_info(self, hxs):
        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()

        short_desc = hxs.select(
            '//div[@class="description2"]/text()').extract()

        description = hxs.select(
            '//div[@id="fragment-1"]/div[@class="description"]').extract()
        description = sportman.delete_tags(re, description[0])
        description = [basic.cdata(description)]

        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price:
            old_price = " ".join(old_price)
            old_price = old_price.split(':')
            old_price = old_price[1].replace('Kr', '')
            old_price = [old_price.replace(" ", "")]

        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if price:
            price = " ".join(price)
            price = price.split(':')
            price = price[1].replace('Kr', '')
            price = [price.replace(" ", "")]
        else:
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
            price = " ".join(price)
            price = price.split(':')
            price = price[1].replace('Kr', '')
            price = [price.replace(" ", "")]

        id = hxs.select('//div[@class="articlenumber"]').extract()
        id = " ".join(id)
        id = id.replace(u"\xa0", "")
        id = basic.get_middle_text(id, 'Art.nr.', '</div>')
        sku = id
        id = [id[0]]

        return name, short_desc, description, old_price, price, id, sku

    def get_vars(self, response, hxs):
        headers1 = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1',
            'Host':
            'www.sportmann.no',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language':
            'en-us,en;q=0.5',
            'Accept-Charset':
            'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection':
            'keep-alive',
            'Referer':
            '/product.aspx?productid=613232',
            'Cookie':
            'ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'
        }
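        # note: the session cookie hard-coded above will have expired; the
        # request below may therefore behave like a fresh, cookieless session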

        page = hxs.select('//html').extract()
        page = " ".join(page)

        viewst = basic.get_middle_text(page, 'id="__VIEWSTATE" value="', '"')
        eventval = basic.get_middle_text(page,
                                         'id="__EVENTVALIDATION" value="', '"')
        prevpage = [""]
        hidden_field = [""]

        r = requests.get(response.url, headers=headers1)

        page_one = r.content

        viewst_page = basic.get_middle_text(page_one,
                                            'id="__VIEWSTATE" value="', '"')
        eventval_page = basic.get_middle_text(
            page_one, 'id="__EVENTVALIDATION" value="', '"')
        prevpage_page = basic.get_middle_text(page_one,
                                              'id="__PREVIOUSPAGE" value="',
                                              '"')
        hidden_temp = page_one.split('id="__VIEWSTATE"')
        hidden_temp = hidden_temp[1].split('id="__PREVIOUSPAGE"')
        hidden_temp = hidden_temp[0].split('<script sr')

        val_x = len(hidden_temp) - 1

        hidden_temp = basic.get_middle_text(hidden_temp[val_x], 'c="', '"')
        hidden_temp_val = hidden_temp[0]
        hidden_temp_val = hidden_temp_val.replace('amp;', '')
        hidden_url = "http://www.sportmann.no" + hidden_temp_val

        request_hidden = urllib2.Request(hidden_url)
        response_hidden = urllib2.urlopen(request_hidden)
        hidden_field_page = basic.get_middle_text(
            response_hidden.read(),
            "ctl00_ScriptManager1_HiddenField').value += '", "';")

        return (viewst[0], eventval[0], prevpage[0], hidden_field[0],
                viewst_page[0], eventval_page[0], prevpage_page[0],
                hidden_field_page[0])

    def get_variants(self, hxs, response):
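        """Build one JSON blob per colour/size combination.

        The page renders colour and size either as plain text plus a
        hidden input (single variant) or as a <select> dropdown (multiple
        variants). When the size list depends on the chosen colour, the
        matching ASP.NET postback is replayed via get_data() for each
        colour value.
        """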
        page = hxs.select('//html').extract()
        page = " ".join(page)
        dict_one = {}
        test_one = []

        temp = page.split('<div class="color">')
        temp = temp[1].split('</div>')
        temp = temp[0].split('<select name')

        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs)

        if len(temp) == 1:
            # Single colour: the page shows plain text plus a hidden input
            # instead of a dropdown. Keep colour and value as one-element
            # lists so the per-colour loop below indexes whole values.
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value'
            ).extract()
            color = [basic.clean_string(color[0].replace("  ", ""))]
            value = [value[0]]
        else:
            test_color = basic.get_middle_text(temp[1], 'farge</option>',
                                               '</select>')
            color = basic.get_middle_text(test_color[0], '">', '</option>')
            value = basic.get_middle_text(test_color[0], 'value="', '">')

            for i in range(0, len(color)):
                color[i] = color[i].replace("  ", "")

        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split('</div>')
        size_temp = size_temp[0].split('<select name')

        if len(size_temp) == 1:
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace("   ", "")]

            size_val = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value'
            ).extract()

            if size[0] == "":
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page,
                                              pre_page, even_page, value[i])

                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split('</select>')

                    if len(a_page) == 1:
                        size = basic.get_middle_text(a_page[0], 'e">',
                                                     '<input type="hidden"')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '"')
                        size_val = [size_val[0]]

                    else:
                        a_page = basic.get_middle_text(a_page[0],
                                                       'se</option>',
                                                       '</select>')
                        size = basic.get_middle_text(a_page[0], '">',
                                                     '</option>')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '">')

                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val

                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace("   ", "")
                    dict_one["size"] = size

                    test_one.append(basic.cdata(json.dumps(dict_one)))

            else:
                dict_one["color"] = color

                dict_one["color_value"] = value
                dict_one['size'] = size
                dict_one['size_value'] = size_val
                test_one.append(basic.cdata(json.dumps(dict_one)))

        else:
            test_size = basic.get_middle_text(size_temp[1], 'se</option>',
                                              '</select>')
            size = basic.get_middle_text(test_size[0], '">', '</option>')
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')

            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace("   ", "")

            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one['size'] = size
            dict_one['size_value'] = size_val

            test_one.append(basic.cdata(json.dumps(dict_one)))

        return test_one

    def get_server_path(self, url):
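        """Map image URLs to their expected paths in the image store.

        Mirrors the naming scheme of Scrapy's images pipeline, which
        saves each download under full/<sha1 of the URL>.jpg.
        """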
        images_array = []
        for i in range(0, len(url)):
            url[i] = basic.clean_string(url[i])

            images_array.append(self.images_store + "/full/" +
                                hashlib.sha1(url[i]).hexdigest() + ".jpg")

        return images_array

    def get_images(self, hxs):
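        """Extract the gallery image URLs from the raw page markup."""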
        page = hxs.select('//html').extract()
        page = " ".join(page)

        images = []

        temp = page.split('class="gallery_demo_unstyled"')
        temp = temp[1].split('<div class="right_container">')
        temp = basic.get_middle_text(temp[0], 'src="', '"')

        for i in range(0, len(temp)):
            image_url = "http://www.sportmann.no" + temp[i]
            images.append(image_url)

        return images

    def get_data(self, response, hidden, viewstate, previouspage,
                 eventvalidation, colorvalue):
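        """Replay the ASP.NET async postback fired by the colour dropdown.

        The response is the partial-page update containing the size
        options available for the given colour value.
        """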
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Host': 'www.sportmann.no',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection': 'keep-alive',
            'Referer': 'http://www.sportmann.no/product.aspx?productid=613232',
            'Cookie': ''
        }

        eventvalidation = urllib.urlencode(
            {"__EVENTVALIDATION": eventvalidation})
        viewstate = urllib.urlencode({"__VIEWSTATE": viewstate})
        previouspage = urllib.urlencode({"__PREVIOUSPAGE": previouspage})
        hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden})

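        # What follows appears to be a hand-captured ScriptManager
        # partial-postback body; only the form tokens and the selected
        # colour value are substituted in.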
        data = "ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel%7Cctl00%24ContentPlaceHolder1%24ddlVariant&" + hidden + "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant&__EVENTARGUMENT=&__LASTFOCUS=&" + viewstate + "&" + previouspage + "&" + eventvalidation + "&ctl00%24ProductSearch%24txtProdSearch=&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState=&ctl00%24ContentPlaceHolder1%24ddlVariant=" + colorvalue + "&ctl00%24ContentPlaceHolder1%24Variant1Hidden=&ctl00%24ContentPlaceHolder1%24Variant2Hidden=&ctl00%24ContentPlaceHolder1%24tbAmount=1&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState=&__ASYNCPOST=true&"

        req = urllib2.Request(response.url, data, headers)

        resp_page = urllib2.urlopen(req).read()

        return resp_page

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d products out of %d\n\n" % (self.counter,
                                                          self.total)
        # filename for writing xml
        if self.d['database']:
            # fall back to the catalog id as filename if the DB lookup
            # below fails (assumption: any stable name will do here)
            filename = str(self.d['catalog_id'])
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except StandardError:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Sportmann: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Sportmann: {0}".format(filename),
                               self.d['email'])
        except StandardError:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
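        """Read product URLs, ids and names from the input Excel file,
        then deduplicate and split off the products without URLs."""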
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
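        """Register the custom XML properties this spider emits."""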
        xml.add_property("short_desc", "Short Description", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("custom_price", "New Price", "text")
        xml.add_property("color_value", "Color Value", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("size_val", "Size Value", "text_list")
        xml.add_property("sku", "Sku", "text")
        xml.add_property("size_options", "Size Options", "text_list")
        xml.add_property("viewstate1", "Viewstate1", "text_list")
        xml.add_property("viewstate2", "Viewstate2", "text_list")
        xml.add_property("viewstate3", "Viewstate3", "text_list")
        xml.add_property("viewstate4", "Viewstate4", "text_list")
        xml.add_property("viewstate5", "Viewstate5", "text_list")
        xml.add_property("viewstate6", "Viewstate6", "text_list")
        xml.add_property("eventval", "Eventval", "text_list")
        xml.add_property("hidden", "Hidden Field", "text_list")
        xml.add_property("prevpage", "Previous Page", "text_list")
        xml.add_property("recommended_product", "Recommended Product",
                         "text_list")