コード例 #1
0
 def __init__(self, *a, **kw):
     super(ExpressSpider, self).__init__(*a, **kw)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     terminal = CommonTerminal(sys.argv, self.name)
     self.log = Logger()
     self.d = terminal.get_arguments()
     self.xml = VariantsXml()
     self.exc = ZmagsException(5)
     shops = CreateShops(self.d['file'], self.xml)
     try:
         shops.get()
     except IndexError:
         print "This sheet has no shop look or line"
     self.get_lists_from_excel()
     self.add_properties(self.xml)
     self.start_urls = self.url_list[:2]
     self.total = len(self.start_urls)
コード例 #2
0
class ExpressSpider(CrawlSpider):
    name = "express"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    temp_msg = ""
    handle_httpstatus_list = [404]
    counter = 0

    def __init__(self, *a, **kw):
        super(ExpressSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = CommonTerminal(sys.argv, self.name)
        self.log = Logger()
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        shops = CreateShops(self.d['file'], self.xml)
        try:
            shops.get()
        except IndexError:
            print "This sheet has no shop look or line"
        self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.start_urls = self.url_list[:2]
        self.total = len(self.start_urls)

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = ExpressItem()
        index = self.url_list.index(response.url)
        self.url_list[index] = self.counter
        flag = 0
        shop_look = 0
        # main try that catches all unhandled errors
        try:
            hxs = HtmlXPathSelector(response)
            if response.url != "http://www.zmags.com/":
                error_404 = hxs.select('//img[@alt="404 Error Page Not Found"]').extract()
                flag = 1
                if not error_404:
                    flag = 1
                    available = hxs.select('//span[@class="glo-tex-error"]/text()').extract()
                    page = " ".join(hxs.select('//html').extract())
                    #part for creating main product in xml
                    id = self.get_product_id(hxs)[0]
                    if id != self.id_list[index]:
                        msg = "\nNot equal, id in sheet {0}, on site {1}".format(self.id_list[index], id)
                        self.temp_msg += msg
                    item['product_id'] = [id]
                    item['name'] = self.get_name(hxs)
                    item['description'], item['promo_text'] = self.get_basic_info(hxs)
                    item['master_price'], item['discount_price'] = self.get_product_prices(hxs)
                    item['shop_look'] = ['False']
                    item['normal'] = ['True']
                    item['shop_line'] = ['False']
                    item['in_stock'] = ["NOT_IN_STOCK"]
                    if available[0] != "This item is no longer available for purchase.":
                        item['category_id'], item['subcategory_id'] = self.get_categories(hxs)
                        item['add_to_cart_id'] = self.get_add_to_cart_id(hxs)
                        color_names, urls, swatch_image_names, jsons = self.get_swatch_images(hxs)
                        #urls = basic.cdata_field(self.map_url_to_server(urls, id, True))
                        item['color_image_url'] = self.create_color_json(urls, color_names)
                        item['in_stock'] = ["IN_STOCK"]
                        item['product_page'] = [response.url]
                        self.xml.create_xml(item)
                        product_images, images_grouped = self.parse_jsons(jsons, color_names)
                        ids, sizes, prices = self.get_variants(page)
                        # calling function that will handle creating all child products
                        self.create_child_products(id, ids, sizes, prices, images_grouped)
                        item['image_urls'] = urls + product_images
                        if self.shop_look_list[index]:
                            self.parse_for_shop_look(hxs, self.shop_look_list[index],
                                                     id, page, images_grouped, response.url, index)
                        if self.shop_line_list[index]:
                            self.parse_for_shop_look(hxs, self.shop_line_list[index],
                                                     id, page, images_grouped, response.url, index)
                    else:
                        self.xml.create_xml(item)
                        self.exc.code_handler(102, response.url)

                else:
                    self.exc.code_handler(104, response.url)
            else:
                basic.not_provided()
                self.exc.code_handler(101, response.url)
            if not flag:
                item['product_id'] = [self.id_list[index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                item['name'] = ["not available"]
                self.xml.create_xml(item)
        except StandardError:
            self.exc.code_handler(100, response.url)
        #if it's last product write xml and run end_operations
        return item

    def parse_for_shop_look(self, hxs, id, product_id, page, images_grouped, product_url, index):
        """Special parse function for shop looks and lines.
        It gets same info stored in different format, mostly json and reference
        to master product id that is actually shop look/line id.
        TO DO: see if there is need to specially handle the case
        for not available"""
        item = ExpressItem()
        item['master_product_id'] = [id]
        item['product_id'] = [id + "_" + product_id]
        if self.ordered:
            item['order_index'] = [self.order_list[index]]
        item['style'] = [product_id]
        item['product_page'] = [product_url]
        item['category_id'], item['subcategory_id'] = self.get_categories(hxs)
        item['add_to_cart_id'] = self.get_add_to_cart_id(hxs)
        # below is part fot creating swatch images and images json
        color_names, urls, swatch_image_names, jsons = self.get_swatch_images(hxs)
        i = 0
        colors = []
        for k in color_names:
            d = {'name': k, 'swatch_url': urls[i], 'image_url': self.get_absolute_url(images_grouped[k])}
            i += 1
            colors.append(simplejson.dumps(d))
        item['colors'] = basic.cdata_field(colors)
        item['price'], item['discount_price'] = self.get_product_prices(hxs)
        item['description'], item['promo_text'] = self.get_basic_info(hxs)
        item['name'] = self.get_name(hxs)
        # below is part for creating variants json
        ids, sizes, prices = self.get_variants(page)
        variants = []
        for k in ids:
            d = {'color': k, 'prices': prices[k], 'ids': ids[k]}
            try:
                d['sizes'] = sizes[k]
            except StandardError:
                print "This product has no sizes"
            variants.append(simplejson.dumps(d))
        item['variants'] = basic.cdata_field(variants)
        self.xml.create_xml(item)

    def parse_shop_look(self, hxs):
        products = hxs.select('//div[@id="cat-ens-prod-item"]')
        i = 0
        # do this with actual id
        item = ExpressItem()
        whole_page = hxs.extract()
        whole_page = "".join(whole_page)
        ensemble_id = basic.get_middle_text(whole_page, "ensembleId: '", "',")
        name = hxs.select('//div[@id="cat-ens-prod-con"]/h1/text()').extract()
        name = basic.clean_string_field(name)
        item['ensemble_id'] = ensemble_id
        item['normal_image_url'] = self.shl_get_image(hxs)
        item['product_id'] = ["DUMMIE1"]
        item['shop_look'] = ['True']
        item['normal'] = ['False']
        item['shop_line'] = ['False']
        item['in_stock'] = ['IN_STOCK']
        item['name'] = name
        xml.create_xml(item)
        item.clear()
        for p in products:
            i += 1
            item = ExpressItem()
            item['master_product_id'] = ['DUMMIE1']
            item['product_id'] = ["DUMMIE1_" + str(i)]
            item['name'], item['price'], item['style'] = self.shl_basic_info(p)
            page = p.extract()
            item['variants'] = basic.cdata_field([self.shl_create_variants(self.get_variants(page))])
            item['colors'] = basic.cdata_field(self.shl_get_swatches(p))
            xml.create_xml(item)
        # return images for download here once it's needed

    def get_categories(self, hxs):
        category_id = hxs.select('//input[@name="categoryId"]/@value').extract()
        sub_category_id = hxs.select('//input[@name="subCategoryId"]/@value').extract()
        return category_id, sub_category_id

    def get_add_to_cart_id(self, hxs):
        return hxs.select('//input[@name="productId"]/@value').extract()

    def shl_get_image(self, hxs):
        page = hxs.extract()
        image = basic.get_middle_text(page, 'imagesets = "', '";')
        image = "http://t.express.com/com/scene7/s7d5/=/is/image/expressfashion/%s/i81" % (image[0])
        return [image]

    def shl_create_variants(self, f):
        """Creates variants for shop look products.
        Stored in dict with all info and returned as json"""
        d_main = {}
        n = []
        colors = [p for p in f[0]]
        for c in colors:
            d = {'color': c, 'ids': f[0][c]}
            try:
                d['sizes'] = f[1][c]
            except StandardError:
                print "This product has no sizes"
            d['prices'] = f[2][c]
            n.append(d)
        d_main['variants'] = n
        return simplejson.dumps(n)

    def shl_get_swatches(self, hxs):
        """Function for getting swatches for shop look way.
        Stores information in dict (name, swatch_url and image url)"""
        p = hxs.select('div[@class="cat-ens-prod-info"]/div[@class="cat-ens-prod-swatch-display"]')
        p = p.select('span/text()').extract()
        l = []
        d = {}
        for c in p:
            temp = c.split(",")
            d['name'] = temp[0]
            d['swatch_url'] = temp[1]
            d['image_url'] = temp[2]
            l.append(simplejson.dumps(d))
        return l

    def shl_basic_info(self, hxs):
        name = hxs.select('div[@class="cat-ens-prod-info"]/h1/text()').extract()
        name = basic.clean_string_field(name)
        price = hxs.select('div[@class="cat-ens-prod-info"]/span/text()').extract()
        price = basic.clean_spaces_field(basic.clean_string_field(price))
        style = hxs.select('div[@class="cat-ens-prod-info"]/text()').extract()
        if len(style) > 2:
            style = [basic.clean_string(style[1])]
        else:
            style = []
        return name, price, style

    def create_color_json(self, urls, names):
        d = {}
        n = []
        for i in range(0, len(urls)):
            d['url'] = urls[i]
            d['name'] = names[i]
            n.append(simplejson.dumps(d))
        return n

    def get_basic_info(self, hxs):
        """Gets basic info about products.
        Returns description and promo text"""
        description = hxs.select('//li[@class="cat-pro-desc"]').extract()[0]
        description = basic.clean_string(description)
        description = [basic.cdata(description)]
        promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/text()').extract()
        if not promo_text:
            promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/font').extract()
        if promo_text:
            promo_text = basic.cdata_field(promo_text)
        return description, promo_text

    def get_name(self, hxs):
        name = hxs.select('//div[@id="cat-pro-con-detail"]/h1/text()').extract()[0]
        name = [basic.clean_string(name)]
        return name

    def get_product_prices(self, hxs):
        """Gets product prices, regular and discount if it exists.
        If no discount returns empty field."""
        price = hxs.select('//li[@class="cat-pro-price"]/strong/text()').extract()
        discount_price = []
        if not price:
            price = hxs.select('//li[@class="cat-pro-price"]/span[@class="cat-glo-tex-oldP"]/text()').extract()
            discount_price = hxs.select('//li[@class="cat-pro-price"]/span[@class="cat-glo-tex-saleP"]/text()').extract()
        if discount_price:
            discount_price = [re.sub('[^0-9.,]', '', discount_price[0])]
        price = [re.sub('[^0-9.,]', '', price[0])]
        return price, discount_price

    def get_product_id(self, hxs):
        """Gets product sku from the page as a field"""
        sku = hxs.select('//input[@name="omnitureStyleID"]/@value').extract()[0]
        sku = sku.replace(";", "")
        return [sku]

    def get_swatch_images(self, hxs):
        """Function for getting swatch images info (names, urls, image names and urls).
        Also it gets and json as list of json urls for getting images set for every color."""
        urls = hxs.select('//li[@id="widget-product-swatches"]/a/img/@src').extract()
        color_names = hxs.select('//li[@id="widget-product-swatches"]/a/img/@alt').extract()
        swatch_image_names = self.get_swatch_image_name(urls)
        if not swatch_image_names and not color_names:
            color_names.append("no_color")
            swatch_image_names = self.get_imagesets(hxs)
        jsons = self.get_json(swatch_image_names)
        return color_names, urls, swatch_image_names, jsons

    def get_imagesets(self, hxs):
        """Function for getting image set in case where there is no color for product.
        Gets image set info from the javascript on the page and selects only first one,
        if there is more because there is only one color to associate with (no_color)"""
        page = hxs.extract()
        print len(page)
        iset = basic.get_middle_text(page, 'imagesets = "', '"; //Change')
        iset = iset[0].split(',')
        return [iset[0]]

    def get_swatch_image_name(self, image_sites):
        """Gets swatch image name from swatch image url"""
        image_names = []
        for x in range(0, len(image_sites)):
            name = basic.get_middle_text(image_sites[x], "fashion/", "_s")[0]
            image_names.append(name)
        return image_names

    def get_json(self, image_names):
        """Gets list of jsons from list of swatch images names"""
        jsons = []
        for i in range(0, len(image_names)):
            json = "http://s7d5.scene7.com/is/image/expressfashion/" + image_names[i] + "?req=imageset,json"
            jsons.append(json)
        return jsons

    def parse_jsons(self, jsons, color_names):
        """Parsing json from json urls.
        Returning all images in field, also returns them grouped by colors,
        so those groups can be used later when creating child products in xml"""
        images = []
        images_grouped = {}
        for i in range(0, len(jsons)):
            json = urllib2.urlopen(jsons[i]).read()
            image = basic.get_middle_text(json, '"expressfashion/', ";")
            rest_of_images = basic.get_middle_text(json, ',expressfashion/', ";")
            temp = image + rest_of_images
            images_grouped = basic.add_to_dict(images_grouped, color_names[i], temp)
            images += temp
        return self.get_absolute_url(images), images_grouped

    def get_absolute_url(self, images):
        """Gets absolute path for images.
        Receives field of relative path images and returns absolute paths"""
        image_urls = []
        for x in range(0, len(images)):
            image_url = "http://s7d5.scene7.com/is/image/expressfashion/" + images[x]
            image_url += "?width=351"
            image_urls.append(image_url)
        return image_urls

    def get_variants(self, page):
        """Getting variants from javascript on the page.
        Returns three dicts ids, sizes and prices. Format of the dicts is like
        (key = color, value = field of (ids, sizes and prices))"""
        temp = page.split("// Load the product variants")[1]
        temp = temp.split("// Set the field to update with the product variant")[0]
        variants = temp.split("// Create the variant")
        sizes = {}
        ids = {}
        prices = {}
        for i in range(1, len(variants)):
            color = basic.get_middle_text(variants[i], "Color','", "')")
            if color:
                color = color[0]
            else:
                color = "no_color"
            ids = basic.add_to_dict(ids, color, basic.get_middle_text(variants[i], "setId('", "')")[0])
            if variants[i].find("Size','") != -1:
                sizes = basic.add_to_dict(sizes, color, basic.get_middle_text(variants[i], "Size','", "')")[0])
            prices = basic.add_to_dict(prices, color, basic.get_middle_text(variants[i], 'numericPrice="', '"')[0])
        return ids, sizes, prices

    def get_image_url(self, images, is_swatch=False):
        """Returns path for images on our servers.
        If it's for swatch it return also swatch paths."""
        image_paths = []
        thumb_paths = []
        for x in range(0, len(images)):
            path = normal_image_url + images[x] + ".jpg"
            thumb_path = thumb_image_url + images[x] + ".jpg"
            image_paths.append(path)
            thumb_paths.append(thumb_path)
        if is_swatch is True:
            return image_paths
        else:
            return image_paths, thumb_paths

    def create_child_products(self, main_id, ids, sizes, prices, images_grouped):
        """Creating child products (both colors and sizes).
        Arguments it gets are: main_id as product id of the master product,
        images_grouped that is a dict of images grouped by color (field i field)
        and dicts ids, sizes and prices (e.g. dict with color names as keys and
        fields of ids for it as values 'black': ['32854, '32855''])"""
        item = ExpressItem()
        i = 0
        for k in ids:
            cur_id = main_id + "_" + chr(i + 97)
            item['product_id'] = [cur_id]
            item['master_product_id'] = [main_id]
            item['color'] = [k]
            # use this for some other path (our server)
#            images, thumbs = self.get_image_url(images_grouped[i])
            if images_grouped:
                images = self.get_absolute_url(images_grouped[k])
    #            item['normal_image_url'], item['thumb_image_url'] = self.map_url_to_server(images,main_id)
                item['normal_image_url'] = basic.cdata_field(self.map_url_to_server(images, main_id))
            self.xml.create_xml(item)
            item.clear()
            j = 0
            for val in ids[k]:
                item['product_id'] = [cur_id + "_" + chr(j + 97)]
                item['master_product_id'] = [cur_id]
                if len(sizes):
                    item['size'] = [sizes[k][j]]
                item['size_option_id'] = [ids[k][j]]
                item['price'] = [prices[k][j]]
                self.xml.create_xml(item)
                j += 1
            i += 1

    def map_url_to_server(self, urls, main_id, is_swatch=False):
        return urls
        new = []
        new1 = []
        for i in range(0, len(urls)):
            new.append(image_path + "/" + main_id + "/full/" + hashlib.sha1(urls[i]).hexdigest() + ".jpg")
            if is_swatch is False:
                new1.append(image_path + "/" + main_id + "/thumb/" + hashlib.sha1(urls[i]).hexdigest() + ".jpg")
        if is_swatch is True:
            return new
        else:
            return new, new1

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        self.xml.write_xml(self.name, self.d['file'])
        msg += self.exc.create_message(self.counter)
        msg += "\n{0}".format(self.temp_msg)
        exp = CommonExport()
        # part for exporting to database here
        if self.d['upload']:
            try:
                exp.xml_to_db(self.name, self.d['file'], "e2b3b658-16d5-4059-a9df-3c212c817d2c")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        msg += self.log.get_message()
        from modules.mail import Mail
        mail = Mail()
        mail.send_mail(msg, "Express scraper report")

    def get_lists_from_excel(self):
        xls = CommonExcel(basic.get_excel_path(self.name, self.d['file']))
        self.ordered = True
        try:
            self.url_list = xls.read_excel_collumn_for_urls(4, 1)
            self.id_list = xls.read_excel_collumn_for_ids(0, 1)
            self.shop_look_list = xls.read_excel_collumn(1, 1)
            self.shop_line_list = xls.read_excel_collumn(2, 1)
            try:
                self.order_list = xls.read_excel_collumn_for_ids(6, 1)
            except:
                self.ordered = False
                self.log.add_message("No order provided in this sheet.")
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)

    def add_properties(self, xml):
        xml.add_property("size_option_id", "Size Option Id", "text")
        xml.add_property("color_image_url", "Color Image Url", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("variants", "Variants", "text_list")
        xml.add_property("style", "Style", "text")
        xml.add_property("mode", "Mode", "text")
        xml.add_property("shop_look", "Shop look", "boolean")
        xml.add_property("shop_line", "Shop line", "boolean")
        xml.add_property("normal", "Normal", "boolean")
        xml.add_property("ensemble_id", "Ensemble ID", "text")
        xml.add_property("promo_text", "Promo text", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("product_page", "Product page", "text")
        xml.add_property("master_price", "Master Price", "decimal")
        xml.add_property("subcategory_id", "Sub Category ID", "text")
        xml.add_property("add_to_cart_id", "Add to cart ID", "text")
        xml.add_property("order_index", "Order Index", "integer")