Python CommonXml Exemples, modules.zmags_xml.CommonXml Python Exemples

Exemple #1

0

Afficher le fichier

 def __init__(self, *a, **kw):
     """Set up the spider and load the product lists to process.

     Registers ``spider_closed`` for ``signals.spider_closed``, reads the
     run arguments through ``DatabaseTerminal`` and, when the ``database``
     argument is truthy, fills ``self.products`` / ``self.no_urls`` from the
     database.  Otherwise it delegates to ``self.get_lists_from_excel()``.
     """
     super(ChomeSpider, self).__init__(*a, **kw)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     terminal = DatabaseTerminal(sys.argv, self.name)
     self.d = terminal.get_arguments()
     self.xml = CommonXml()
     self.exc = ZmagsException(5)
     if self.d['database']:
         self.database = Database()
         self.database.connect()
         try:
             self.products, self.no_urls = self.database.select_products(
                 self.d['catalog_id'], self.d['product_id'])
         finally:
             # Release the connection even when the query raises.
             self.database.disconnect()
     else:
         self.get_lists_from_excel()
     self.add_properties(self.xml)
     self.images_store = "/" + settings['IMAGES_STORE']
     # The progress total is the number of product ids listed in no_urls.
     self.total = len(self.no_urls['product_ids'])

Exemple #2

0

Afficher le fichier

 def __init__(self, *a, **kw):
     """Set up the spider and load the product lists to crawl.

     Registers ``spider_closed`` for ``signals.spider_closed``, reads the
     run arguments through ``DatabaseTerminal`` and, when the ``database``
     argument is truthy, fills ``self.products`` / ``self.no_urls`` from the
     database.  Otherwise it delegates to ``self.get_lists_from_excel()``.
     The product URLs become ``start_urls``.
     """
     super(GuitarCenterSpider, self).__init__(*a, **kw)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     terminal = DatabaseTerminal(sys.argv, self.name)
     self.d = terminal.get_arguments()
     self.xml = CommonXml()
     self.exc = ZmagsException(5)
     if self.d['database']:
         self.database = Database()
         self.database.connect()
         try:
             self.products, self.no_urls = self.database.select_products(
                 self.d['catalog_id'], self.d['product_id'])
         finally:
             # Release the connection even when the query raises.
             self.database.disconnect()
     else:
         self.get_lists_from_excel()
     self.add_properties(self.xml)
     self.handle_not_provided()
     self.start_urls = self.products['urls']
     self.total = len(self.products['urls'])

Exemple #3

0

Afficher le fichier

Fichier : burton_spider.py Projet : marjevtic/testMarko

 def __init__(self, *a, **kw):
     """Set up the spider and load the product lists to crawl.

     Registers ``spider_closed`` for ``signals.spider_closed``, reads the
     run arguments through ``DatabaseTerminal`` and, when the ``database``
     argument is truthy, fills ``self.products`` / ``self.no_urls`` from the
     database.  Otherwise it delegates to ``self.get_lists_from_excel()``.
     """
     super(BurtonSpider, self).__init__(*a, **kw)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     terminal = DatabaseTerminal(sys.argv, self.name)
     self.d = terminal.get_arguments()
     self.xml = CommonXml()
     self.exc = ZmagsException(5, "Burton")
     if self.d['database']:
         self.database = Database()
         self.database.connect()
         try:
             self.products, self.no_urls = self.database.select_products(
                 self.d['catalog_id'], self.d['product_id'])
         finally:
             # Release the connection even when the query raises.
             self.database.disconnect()
     else:
         self.get_lists_from_excel()
     self.handle_not_provided()
     burton.add_properties(self.xml)
     self.start_urls = self.products['urls']
     # NOTE(review): the product URLs assigned above are immediately replaced
     # by this single hard-coded URL -- looks like a debugging leftover;
     # confirm before removing either assignment.
     self.start_urls = [
         "http://www.dickssportinggoods.com/product/index.jsp?productId=13243074"
     ]
     self.images_store = "/" + settings['IMAGES_STORE']
     self.total = len(self.start_urls)

Exemple #4

0

Afficher le fichier

Fichier : partylite_spider.py Projet : marjevtic/testMarko

 def __init__(self, *a, **kw):
     """Set up the spider, its run options and the product lists.

     Registers ``spider_closed`` for ``signals.spider_closed``, reads the
     run arguments through ``PartyliteTerminal`` and copies the ``env``,
     ``upload``, ``lang`` and ``file`` arguments onto the instance.  When
     the ``database`` argument is truthy the product lists come from the
     database and ``self.change_url_list()`` is applied afterwards;
     otherwise ``self.get_lists_from_excel()`` is called.
     """
     super(PartyliteSpider, self).__init__(*a, **kw)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     terminal = PartyliteTerminal(sys.argv, self.name)
     self.d = terminal.get_arguments()
     self.images_store = "/" + settings['IMAGES_STORE']
     self.users = party.get_users(settings, self.d)
     self.exc = ZmagsException(50)
     self.production = self.d['env']
     self.upload = self.d['upload']
     self.english = self.d['lang']
     self.file_name = self.d['file']
     if self.d['database']:
         self.database = Database()
         self.database.connect()
         try:
             self.products, self.no_urls = self.database.select_products(
                 self.d['catalog_id'], self.d['product_id'])
         finally:
             # Release the connection even when the query raises.
             self.database.disconnect()
         self.change_url_list()
     else:
         self.get_lists_from_excel()
     self.xml = CommonXml()
     party.add_properties(self.xml)
     self.total = len(self.products['urls'])

Exemple #5

0

Afficher le fichier

Fichier : guitar_center_spider.py Projet : marjevtic/testMarko

 def __init__(self, *a, **kw):
     """Set up the spider and load the product lists to crawl.

     Registers ``spider_closed`` for ``signals.spider_closed``, reads the
     run arguments through ``DatabaseTerminal`` and, when the ``database``
     argument is truthy, fills ``self.products`` / ``self.no_urls`` from the
     database.  Otherwise it delegates to ``self.get_lists_from_excel()``.
     The product URLs become ``start_urls``.
     """
     super(GuitarCenterSpider, self).__init__(*a, **kw)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     terminal = DatabaseTerminal(sys.argv, self.name)
     self.d = terminal.get_arguments()
     self.xml = CommonXml()
     self.exc = ZmagsException(5)
     if self.d["database"]:
         self.database = Database()
         self.database.connect()
         try:
             self.products, self.no_urls = self.database.select_products(
                 self.d["catalog_id"], self.d["product_id"])
         finally:
             # Release the connection even when the query raises.
             self.database.disconnect()
     else:
         self.get_lists_from_excel()
     self.add_properties(self.xml)
     self.handle_not_provided()
     self.start_urls = self.products["urls"]
     self.total = len(self.products["urls"])

Exemple #6

0

Afficher le fichier

Fichier : chome_spider.py Projet : marjevtic/testMarko

 def __init__(self, *a, **kw):
     """Set up the spider and load the product lists to process.

     Registers ``spider_closed`` for ``signals.spider_closed``, reads the
     run arguments through ``DatabaseTerminal`` and, when the ``database``
     argument is truthy, fills ``self.products`` / ``self.no_urls`` from the
     database.  Otherwise it delegates to ``self.get_lists_from_excel()``.
     """
     super(ChomeSpider, self).__init__(*a, **kw)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     terminal = DatabaseTerminal(sys.argv, self.name)
     self.d = terminal.get_arguments()
     self.xml = CommonXml()
     self.exc = ZmagsException(5)
     if self.d['database']:
         self.database = Database()
         self.database.connect()
         try:
             self.products, self.no_urls = self.database.select_products(
                 self.d['catalog_id'], self.d['product_id'])
         finally:
             # Release the connection even when the query raises.
             self.database.disconnect()
     else:
         self.get_lists_from_excel()
     self.add_properties(self.xml)
     self.images_store = "/" + settings['IMAGES_STORE']
     # The progress total is the number of product ids listed in no_urls.
     self.total = len(self.no_urls['product_ids'])

Exemple #7

0

Afficher le fichier

Fichier : sportman_spider.py Projet : marjevtic/testMarko

    def __init__(self, *a, **kw):
        """Set up the spider and load the product lists to crawl.

        Registers ``spider_closed`` for ``signals.spider_closed``, reads the
        run arguments through ``DatabaseTerminal`` and, when the ``database``
        argument is truthy, fills ``self.products`` / ``self.no_urls`` from
        the database.  Otherwise it delegates to
        ``self.get_lists_from_excel()``.  The product URLs become
        ``start_urls``.
        """
        super(SportmanSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")

        if self.d["database"]:
            self.database = Database()
            self.database.connect()
            try:
                self.products, self.no_urls = self.database.select_products(
                    self.d["catalog_id"], self.d["product_id"])
            finally:
                # Release the connection even when the query raises.
                self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.start_urls = self.products["urls"]
        self.images_store = "/" + settings["IMAGES_STORE"]
        self.total = len(self.start_urls)

Exemple #8

0

Afficher le fichier

Fichier : burton_spider.py Projet : marjevtic/testMarko

 def __init__(self, *a, **kw):
     """Set up the spider and load the product lists to crawl.

     Registers ``spider_closed`` for ``signals.spider_closed``, reads the
     run arguments through ``DatabaseTerminal`` and, when the ``database``
     argument is truthy, fills ``self.products`` / ``self.no_urls`` from the
     database.  Otherwise it delegates to ``self.get_lists_from_excel()``.
     """
     super(BurtonSpider, self).__init__(*a, **kw)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     terminal = DatabaseTerminal(sys.argv, self.name)
     self.d = terminal.get_arguments()
     self.xml = CommonXml()
     self.exc = ZmagsException(5, "Burton")
     if self.d['database']:
         self.database = Database()
         self.database.connect()
         try:
             self.products, self.no_urls = self.database.select_products(
                 self.d['catalog_id'], self.d['product_id'])
         finally:
             # Release the connection even when the query raises.
             self.database.disconnect()
     else:
         self.get_lists_from_excel()
     self.handle_not_provided()
     burton.add_properties(self.xml)
     self.start_urls = self.products['urls']
     # NOTE(review): the product URLs assigned above are immediately replaced
     # by this single hard-coded URL -- looks like a debugging leftover;
     # confirm before removing either assignment.
     self.start_urls = ["http://www.dickssportinggoods.com/product/index.jsp?productId=13243074"]
     self.images_store = "/" + settings['IMAGES_STORE']
     self.total = len(self.start_urls)

Exemple #9

0

Afficher le fichier

class SportmanSpider(CrawlSpider):
    name = "sportman"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Set up the spider and load the product lists to crawl.

        Registers ``spider_closed`` for ``signals.spider_closed``, reads the
        run arguments through ``DatabaseTerminal`` and, when the ``database``
        argument is truthy, fills ``self.products`` / ``self.no_urls`` from
        the database.  Otherwise ``get_lists_from_excel()`` fills them from
        the Excel file.  The product URLs become ``start_urls``.
        """
        super(SportmanSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")

        if self.d['database']:
            self.database = Database()
            self.database.connect()
            try:
                self.products, self.no_urls = self.database.select_products(
                    self.d['catalog_id'], self.d['product_id'])
            finally:
                # Release the connection even when the query raises.
                self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page and add it to the XML output.

        A redirected request is recorded as ``NOT_AVAILABLE`` with the id and
        name from ``self.products``.  Otherwise the page is scraped for basic
        info, the ASP.NET view state (stored in six slices), the variants and
        the image paths.  ``self.products["status"]`` is updated to
        ``"no_avail"``, ``"ran"`` or ``"error"`` for the product.

        Returns the item on success and ``None`` when scraping raised.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        redirected = 'redirect_urls' in response.request.meta
        if redirected:
            # Products are keyed by the URL that was originally requested.
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        # Deliberately outside the try block: the error handler below needs
        # ``index``, so an unknown URL propagates instead of being recorded.
        index = self.products['urls'].index(cur_url)
        try:
            if redirected:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                (item["name"], item["short_desc"], item["description"],
                 item["old_price"], item["custom_price"],
                 item["product_id"], item["sku"]) = self.get_basic_info(hxs)
                item['in_stock'] = ['IN_STOCK']
                # Only the first two of the eight values are needed here.
                viewstate, eventval = self.get_vars(response, hxs)[:2]

                # The view state is stored in 2000-character slices; the
                # sixth slice takes everything from offset 10000 onwards.
                for number in range(1, 6):
                    start = (number - 1) * 2000
                    item["viewstate%d" % number] = [
                        basic.cdata(viewstate[start:start + 2000])]
                item["viewstate6"] = [basic.cdata(viewstate[10000:])]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)

                images_url = self.get_images(hxs)
                # get_server_path() rewrites its argument in place, so hand it
                # a copy and keep the raw URLs for the image pipeline below.
                item["normal_image_url"] = self.get_server_path(
                    list(images_url))

                self.xml.create_xml(item)
                item.clear()
                item['image_urls'] = images_url
                self.products["status"][index] = "ran"
        except Exception:
            # Exception rather than a bare except, so that SystemExit and
            # KeyboardInterrupt are not recorded as scraping errors.
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def get_basic_info(self, hxs):
        """Extract the basic product fields from the page selector.

        Returns the tuple ``(name, short_desc, description, old_price, price,
        id, sku)``.  ``description`` is a one-element list wrapped with
        ``basic.cdata``; ``old_price`` stays an empty list when the page has
        no old price; ``price`` falls back to the ``normalprice`` span when
        there is no ``nowprice`` span.
        """

        def strip_price(fragments):
            # Join the text nodes, take the second ':'-separated field and
            # drop the 'Kr' marker and all spaces.
            amount = " ".join(fragments).split(':')[1]
            return [amount.replace('Kr', '').replace(" ", "")]

        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
        short_desc = hxs.select(
            '//div[@class="description2"]/text()').extract()

        raw_description = hxs.select(
            '//div[@id="fragment-1"]/div[@class="description"]').extract()
        stripped = sportman.delete_tags(re, raw_description[0])
        description = [basic.cdata(stripped)]

        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price != []:
            old_price = strip_price(old_price)

        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if price == []:
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
        price = strip_price(price)

        article = " ".join(
            hxs.select('//div[@class="articlenumber"]').extract())
        article = article.replace(u"\xa0", "")
        # sku keeps the full result; the product id is its first element.
        sku = basic.get_middle_text(article, 'Art.nr.', '</div>')
        product_id = [sku[0]]

        return name, short_desc, description, old_price, price, product_id, sku

    def get_vars(self, response, hxs):
        """Collect the ASP.NET form state for the current product page.

        Returns an 8-tuple: the ``__VIEWSTATE`` and ``__EVENTVALIDATION``
        values found in ``hxs``, two empty strings, then the
        ``__VIEWSTATE``, ``__EVENTVALIDATION`` and ``__PREVIOUSPAGE`` values
        of the same URL re-fetched with ``requests``, and finally the
        hidden-field value read from the script URL referenced by that page.
        """
        request_headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1',
            'Host':
            'www.sportmann.no',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language':
            'en-us,en;q=0.5',
            'Accept-Charset':
            'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection':
            'keep-alive',
            'Referer':
            '/product.aspx?productid=613232',
            'Cookie':
            'ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'
        }

        # Form state as seen by the Scrapy response.
        markup = " ".join(hxs.select('//html').extract())
        viewst = basic.get_middle_text(
            markup, 'id="__VIEWSTATE" value="', '"')
        eventval = basic.get_middle_text(
            markup, 'id="__EVENTVALIDATION" value="', '"')

        # Form state of the same URL fetched again with the fixed headers.
        fetched = requests.get(response.url, headers=request_headers).content
        viewst_page = basic.get_middle_text(
            fetched, 'id="__VIEWSTATE" value="', '"')
        eventval_page = basic.get_middle_text(
            fetched, 'id="__EVENTVALIDATION" value="', '"')
        prevpage_page = basic.get_middle_text(
            fetched, 'id="__PREVIOUSPAGE" value="', '"')

        # Take the markup between the __VIEWSTATE and __PREVIOUSPAGE fields
        # and keep the last '<script sr' piece of it.
        between = fetched.split('id="__VIEWSTATE"')[1]
        between = between.split('id="__PREVIOUSPAGE"')[0]
        last_script = between.split('<script sr')[-1]

        script_path = basic.get_middle_text(last_script, 'c="', '"')[0]
        hidden_url = "http://www.sportmann.no" + script_path.replace(
            'amp;', '')

        hidden_response = urllib2.urlopen(urllib2.Request(hidden_url))
        hidden_field_page = basic.get_middle_text(
            hidden_response.read(),
            "ctl00_ScriptManager1_HiddenField').value += '", "';")

        return (viewst[0], eventval[0], "", "", viewst_page[0],
                eventval_page[0], prevpage_page[0], hidden_field_page[0])

    def get_variants(self, hxs, response):
        """Build the color/size variant descriptions for a product page.

        Returns a list of strings, each being a JSON dump of a dict with the
        keys ``color``, ``color_value``, ``size`` and ``size_value`` passed
        through ``basic.cdata``.  When the page shows no size, one entry is
        produced per color value by posting the variant form through
        ``get_data``; otherwise a single entry is produced.
        """
        page = hxs.select('//html').extract()
        page = " ".join(page)
        dict_one = {}
        test_one = []

        # Isolate the markup of the color <div> and split it on
        # '<select name'; a single piece means it holds no drop-down.
        temp = page.split('<div class="color">')
        temp = temp[1].split('</div>')
        temp = temp[0].split('<select name')

        # parse() has already called get_vars() for this page; this second
        # call repeats the HTTP requests made there.
        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs)

        if (len(temp) == 1):
            # No drop-down: the color is plain text and its value comes from
            # the Variant1Hidden input.
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value'
            ).extract()
            color[0] = color[0].replace("  ", "")
            color = basic.clean_string(color[0])
            # NOTE(review): value becomes a single element here, yet the
            # no-size loop below iterates range(len(value)) and indexes
            # color[i] / value[i] -- confirm that is intended in this case.
            value = value[0]

        #            color = basic.clean_string(color[0])
        #            color = color.replace("  ","")
        #
        #            dict['color'] = color
        #            dict['color_value'] = value[0]

        else:
            # Drop-down present: read the option labels and values that
            # follow the 'farge</option>' entry.
            test_color = basic.get_middle_text(temp[1], 'farge</option>',
                                               '</select>')
            color = basic.get_middle_text(test_color[0], '">', '</option>')
            value = basic.get_middle_text(test_color[0], 'value="', '">')

            for i in range(0, len(color)):
                color[i] = color[i].replace("  ", "")
            #
            #                dict['color'] = color
            #                dict['color_value'] = value

        # Same drop-down detection for the size <div>.
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split('</div>')
        size_temp = size_temp[0].split('<select name')

        if (len(size_temp) == 1):
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace("   ", "")]

            size_val = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value'
            ).extract()

            if size[0] == "":
                # The page shows no size: post the form once per color value
                # and read the sizes from each response.
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page,
                                              pre_page, even_page, value[i])

                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split('</select>')

                    if len(a_page) == 1:
                        # No '</select>' in the size block of the response.

                        size = basic.get_middle_text(a_page[0], 'e">',
                                                     '<input type="hidden"')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]

                    else:
                        a_page = basic.get_middle_text(a_page[0],
                                                       'se</option>',
                                                       '</select>')
                        size = basic.get_middle_text(a_page[0], '">',
                                                     '</option>')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '">')

                    # dict_one is reused for every color; it is serialized at
                    # the end of each iteration.
                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val

                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace("   ", "")

                        # NOTE(review): assigned inside the loop, so "size"
                        # is not refreshed when size is empty -- confirm.
                        dict_one["size"] = size

                    test_one.append(basic.cdata(json.dumps(dict_one)))

            else:
                dict_one["color"] = color

                dict_one["color_value"] = value
                dict_one['size'] = size
                dict_one['size_value'] = size_val
                # NOTE(review): this branch uses simplejson while the other
                # branches use json.
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))

        else:
            # Size drop-down present: read the option labels and values that
            # follow the 'se</option>' entry.
            test_size = basic.get_middle_text(size_temp[1], 'se</option>',
                                              '</select>')
            size = basic.get_middle_text(test_size[0], '">', '</option>')
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')

            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace("   ", "")

            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one['size'] = size
            dict_one['size_value'] = size_val

            test_one.append(basic.cdata(json.dumps(dict_one)))

        return test_one

    def get_server_path(self, url):
        """Map image URLs to their paths under ``self.images_store``.

        Every entry of ``url`` is replaced in place by its
        ``basic.clean_string`` form.  The returned list holds, for each
        entry, ``<images_store>/full/<sha1 hex digest of the entry>.jpg``.
        """
        images_array = []
        for position, raw in enumerate(url):
            cleaned = basic.clean_string(raw)
            # The caller's list is updated too, as callers may rely on it.
            url[position] = cleaned
            digest = hashlib.sha1(cleaned).hexdigest()
            images_array.append(self.images_store + "/full/" + digest + ".jpg")
        return images_array

    def get_images(self, hxs):
        """Return the absolute image URLs found in the page's gallery.

        The gallery is the markup between ``class="gallery_demo_unstyled"``
        and ``<div class="right_container">``; every ``src`` value in it is
        prefixed with the site root.
        """
        markup = " ".join(hxs.select('//html').extract())

        gallery = markup.split('class="gallery_demo_unstyled"')[1]
        gallery = gallery.split('<div class="right_container">')[0]
        sources = basic.get_middle_text(gallery, 'src="', '"')

        return ["http://www.sportmann.no" + sources[position]
                for position in range(len(sources))]

    def get_data(self, response, hidden, viewstate, previouspage,
                 eventvalidation, colorvalue):
        """POST the variant drop-down form back to ``response.url``.

        ``hidden``, ``viewstate``, ``previouspage`` and ``eventvalidation``
        are url-encoded under their form field names and spliced into the
        fixed form body together with ``colorvalue``, which is inserted
        verbatim as the selected ``ddlVariant`` value.

        Returns the raw response body read from ``urllib2.urlopen``.
        """
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Host': 'www.sportmann.no',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection': 'keep-alive',
            'Referer': 'http://www.sportmann.no/product.aspx?productid=613232',
            'Cookie': ''
        }

        eventvalidation = urllib.urlencode(
            {"__EVENTVALIDATION": eventvalidation})
        viewstate = urllib.urlencode({"__VIEWSTATE": viewstate})
        previouspage = urllib.urlencode({"__PREVIOUSPAGE": previouspage})
        hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden})

        # The expression is parenthesized so that it can span several lines;
        # adjacent string literals are concatenated by the compiler, so the
        # resulting body is the same text as the original single expression.
        data = (
            "ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel%7Cctl00%24ContentPlaceHolder1%24ddlVariant&"
            + hidden +
            "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0"
            "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0"
            "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0"
            "&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant&__EVENTARGUMENT=&__LASTFOCUS=&"
            + viewstate + "&" + previouspage + "&" + eventvalidation +
            "&ctl00%24ProductSearch%24txtProdSearch="
            "&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState="
            "&ctl00%24ContentPlaceHolder1%24ddlVariant="
            + colorvalue +
            "&ctl00%24ContentPlaceHolder1%24Variant1Hidden="
            "&ctl00%24ContentPlaceHolder1%24Variant2Hidden="
            "&ctl00%24ContentPlaceHolder1%24tbAmount=1"
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName="
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState="
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail="
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState="
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName="
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState="
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail="
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState="
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment="
            "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState="
            "&__ASYNCPOST=true&"
        )

        req = urllib2.Request(response.url, data, headers)

        resp_page = urllib2.urlopen(req).read()

        return resp_page

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.  When the run came
        from the database the status message is also written to
        ``logs/<spider name>/<filename>``.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml; falls back to the spider name so that a
        # failed database lookup no longer leaves it undefined (which used to
        # abort the handler with a NameError before the XML was written).
        filename = self.name
        if self.d['database']:
            try:
                self.database.connect()
                try:
                    filename = self.database.get_name(self.d['catalog_id'])
                    self.database.update_db(self.products)
                finally:
                    # Release the connection even when the update fails.
                    self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Sportmann: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Sportmann: {0}".format(filename),
                               self.d['email'])
        except Exception:
            # Mailing is best effort; the failure is only noted in the log.
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the Excel file.

        Reads the URL, product-id and name columns through ``DictExcel``.
        Read failures are reported via ``self.exc.code_handler(103, ...)``.
        Afterwards duplicates are removed, the entries are separated into
        ``products`` and ``no_urls``, and both get a none status added.
        """

        def report_failure(reason):
            # Both failure kinds end with the same "given file" line.
            details = reason + "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=details)

        excel_path = basic.get_excel_path(self.name, self.d['file'])
        xls = DictExcel(excel_path)
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as err:
            report_failure(
                "I/O error {0}: {1}".format(err.errno, err.strerror))
        except StandardError:
            report_failure("Error reading excel file")

        deduplicated = xls.delete_duplicates_dict(self.products)
        self.products = deduplicated
        with_urls, without_urls = xls.separate_no_urls(deduplicated)
        self.products, self.no_urls = with_urls, without_urls
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register this spider's custom properties on ``xml``.

        Calls ``xml.add_property(key, label, kind)`` once per property, in
        the order listed below.
        """
        properties = (
            ("short_desc", "Short Description", "text"),
            ("old_price", "Old Price", "text"),
            ("custom_price", "New Price", "text"),
            ("color_value", "Color Value", "text"),
            ("in_stock", "In Stock", "text"),
            ("size_val", "Size Value", "text_list"),
            ("sku", "Sku", "text"),
            ("size_options", "Size_options", "text_list"),
            ("viewstate1", "Viewstate1", "text_list"),
            ("viewstate2", "Viewstate2", "text_list"),
            ("viewstate3", "Viewstate3", "text_list"),
            ("viewstate4", "Viewstate4", "text_list"),
            ("viewstate5", "Viewstate5", "text_list"),
            ("viewstate6", "Viewstate6", "text_list"),
            ("eventval", "Eventval", "text_list"),
            ("hidden", "Hidden Field", "text_list"),
            ("prevpage", "Previous Page", "text_list"),
            ("recommended_product", "Recommended Product", "text_list"),
        )
        for key, label, kind in properties:
            xml.add_property(key, label, kind)

Exemple #10

0

Afficher le fichier

Fichier : guitar_center_spider.py Projet : marjevtic/testMarko

class GuitarCenterSpider(CrawlSpider):
    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Prepare the spider: arguments, xml writer, product lists, start URLs.

        The product lists are read from the database when the ``database``
        argument is set and from the excel file otherwise. Products without
        URLs are written to the xml right away via ``handle_not_provided``.
        """
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if not self.d["database"]:
            self.get_lists_from_excel()
        else:
            self.database = Database()
            self.database.connect()
            selection = self.database.select_products(self.d["catalog_id"], self.d["product_id"])
            self.products, self.no_urls = selection
            self.database.disconnect()
        self.add_properties(self.xml)
        self.handle_not_provided()
        product_urls = self.products["urls"]
        self.start_urls = product_urls
        self.total = len(product_urls)

    def parse(self, response):
        """Scrape one product page into a ``GuitarCenterItem``.

        On success the item is written to the xml and its ``image_urls`` are
        filled in afterwards (so they are not part of the xml entry). On a
        ``StandardError`` the product's status is set to "error", the error
        is reported through ``self.exc`` and the partially filled item is
        still returned.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()

        # For redirected requests the first entry of redirect_urls is used for
        # the lookup in self.products (presumably the originally requested
        # URL -- confirm against the redirect middleware).
        meta = response.request.meta
        if "redirect_urls" in meta:
            cur_url = meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            item["product_id"] = [self.products["product_ids"][index]]
            item["name"], item["brand"] = self.get_basic_info(hxs)
            item["heading"], item["details"], item["specs"], item["call_to_action"] = self.get_description(hxs)
            item["brand_image"], item["brand_image_promo"], brand_images = self.get_description_images(hxs)
            item["old_price"], item["discount"], item["price"] = self.get_prices(hxs)
            item["image_json"], img = self.get_images(hxs)
            item["serial"] = self.get_serials(hxs)
            item["warranty"] = self.gold_coverage(hxs)
            item["in_stock"] = self.get_available(hxs)
            item["product_ref"], item["add_to_cart_id"] = self.get_add_to_cart(hxs)
            # without an add-to-cart id the product is reported as unavailable
            if not item["add_to_cart_id"]:
                item["in_stock"] = ["NOT_AVAILABLE"]
            item["shipping"] = self.get_shipping(hxs)
            item["colors"] = self.get_colors(hxs)
            self.products["status"][index] = "ran"
        except StandardError:
            self.products["status"][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            # set after create_xml on purpose: image_urls is only attached to
            # the returned item, not to the xml entry
            item["image_urls"] = img + brand_images
        return item

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE xml entry for every product without a URL.

        Each id in ``self.no_urls["product_ids"]`` is paired with the name at
        the same position in ``self.no_urls["names"]``. Pairing by position
        (instead of ``list.index``) keeps the loop linear and gives duplicate
        ids their own name rather than the name of the first occurrence.
        """
        item = GuitarCenterItem()
        for position, product_id in enumerate(self.no_urls["product_ids"]):
            item["product_id"] = [product_id]
            item["name"] = [self.no_urls["names"][position]]
            item["in_stock"] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return ``(name, brand)`` for the product page.

        ``name`` is a one-element list holding the joined, cleaned ``h1.fn``
        text with non-breaking spaces removed; ``brand`` is the raw list of
        ``span.brand`` text nodes.
        """
        title_parts = hxs.select('//h1[@class="fn"]/text()').extract()
        title = basic.clean_string("".join(title_parts))
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        return [title.replace(u"\xa0", "")], brand

    def get_description_images(self, hxs):
        """Return ``(brand_image, brand_image_promo, images)``.

        The first two are one-element lists with the server path of the first
        matching image (or the empty extraction result when nothing matched);
        ``images`` is the concatenation of both raw extraction results.
        """
        logo = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        promo_logo = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        originals = logo + promo_logo
        brand_image = [self.get_server_path(logo[0])] if logo else logo
        brand_image_promo = [self.get_server_path(promo_logo[0])] if promo_logo else promo_logo
        return brand_image, brand_image_promo, originals

    def get_description(self, hxs):
        """Return ``(heading, details, specs, call_to_action)``.

        Each element is the extraction result for its xpath passed through
        ``basic.cdata_field``.
        """
        xpaths = (
            '//div[@id="description"]/p',
            '//p[@class="description"]',
            '//div[@class="specs"]/ul',
            '//div[@class="callToAction"]/p/text()',
        )
        # extract everything first, then wrap, as four separate statements would
        extracted = [hxs.select(xpath).extract() for xpath in xpaths]
        return tuple([basic.cdata_field(chunk) for chunk in extracted])

    # function for getting prices; returns the old price, discount and price lists, each empty when the page has no such value
    def get_prices(self, hxs):
        """Return ``(old_price, discount, price)`` as cleaned price lists.

        ``old_price`` is the first ``lineItemList`` value when the list has
        more than one label, ``discount`` is its last value and ``price`` is
        the ``topAlignedPrice`` text. When all three are empty, ``price``
        falls back to the ``inlineList`` values. Every list is passed through
        ``clean_price``.
        """
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
        try:
            discount = [basic.clean_string(value[-1])]
        except IndexError:
            # no <dd> values at all
            print("This product has no price.")
        # extract() yields a (possibly empty) list, so no IndexError handling
        # is needed around it
        price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    # returning json with image url and serial number of product image refers to
    def get_images(self, hxs):
        """Return ``(images_list, img)`` for the thumbnail gallery.

        ``images_list`` holds one cdata-wrapped json string per thumbnail with
        the keys ``image_url`` and ``product_serial`` (the ``li`` class with
        any "site1sku" marker removed); ``img`` holds the raw hrefs.
        """
        hrefs = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        css_classes = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        encoded = []
        raw_urls = []
        entry = {}
        for position, href in enumerate(hrefs):
            entry["image_url"] = self.get_server_path(href)
            raw_urls.append(href)
            # replace() leaves the class untouched when the marker is absent
            entry["product_serial"] = css_classes[position].replace("site1sku", "")
            encoded.append(basic.cdata(simplejson.dumps(entry)))
        return encoded, raw_urls

    # function for getting serials and all information about them, currently returns field with jsons with all
    # information, can be modified to return dicts if needed for subproducts for those one day
    def get_serials(self, hxs):
        """Return the hidden ``styleInfo`` json blobs, re-serialised and cdata-wrapped."""
        blobs = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        # loads + dumps round-trip: each blob is parsed and serialised again
        return [basic.cdata(simplejson.dumps(simplejson.loads(blob))) for blob in blobs]

    def get_server_path(self, url):
        """Return the image path to publish for ``url``.

        The absolute image URL from the source site is kept, so ``url`` is
        returned unchanged. (The former alternative, mapping the URL to a
        sha1-named ".jpg" under the local image store, sat behind this
        ``return`` and could never execute; it has been removed.)
        """
        return url

    # function for getting gold coverage from the page which is actually additional warranty options
    def gold_coverage(self, hxs):
        """Return the gold coverage options as cdata-wrapped json strings.

        Every checkbox value is paired with the label at the same position;
        each json object has the keys ``id`` and ``name``.
        """
        option_ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        option_labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        options = []
        option = {}
        for position, option_id in enumerate(option_ids):
            option["id"] = option_id
            option["name"] = option_labels[position]
            options.append(basic.cdata(simplejson.dumps(option)))
        return options

    # function for getting availability
    def get_available(self, hxs):
        """Return the availability of the product as a list.

        An "in_stock" marker is upper-cased, any other marker is returned as
        extracted, and a missing marker is reported as ``["IN_STOCK"]``.
        """
        status = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if not status:
            # for those that have color options and in stock status for each of those
            # put IN_STOCK for the product as it has no that option on the page
            return ["IN_STOCK"]
        if status[0] == "in_stock":
            return [status[0].upper()]
        return status

    # function for getting add to cart id and product reference
    def get_add_to_cart(self, hxs):
        """Return ``(product_ref, add_to_cart_id)`` as one-element lists.

        Both come from the ``data-rel`` attribute of the "addToList" link,
        which is split on "|". When the page has no such link, a message is
        printed and two empty lists are returned.
        """
        try:
            link_data = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()[0]
        except IndexError:
            # only the "no such link" case is treated as unavailable; other
            # errors propagate to the caller instead of being swallowed
            print("Product not available")
            return [], []
        parts = link_data.split("|")
        return [parts[0]], [parts[1]]

    # function for getting shipping information
    def get_shipping(self, hxs):
        """Return the text nodes of the shipping block as a list."""
        shipping_xpath = '//div[@id="targeter_pdpShipping"]/span/text()'
        shipping_nodes = hxs.select(shipping_xpath)
        return shipping_nodes.extract()

    # function for getting colors, return jsons with all the data about options
    def get_colors(self, hxs):
        """Return the ``styleInfo`` json blobs, re-serialised and cdata-wrapped."""
        blobs = hxs.select('//var[@class="styleInfo"]/text()').extract()
        # loads + dumps round-trip: each blob is parsed and serialised again
        return [basic.cdata(simplejson.dumps(simplejson.loads(blob))) for blob in blobs]

    # cleaning price to leave only numbers
    def clean_price(self, price):
        """Return ``price`` with everything but digits and dots stripped from each entry."""
        return [re.sub("[^0-9.]", "", raw_price) for raw_price in price]

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.

        ``spider`` is the argument delivered with the signal; it is not used.
        When running from the database, the report is also written to
        ``logs/<spider name>/<filename>``.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d["database"]:
            # Fallback name: if the database round-trip below fails before the
            # catalog name is fetched, the xml/mail/log steps still need a
            # filename instead of failing with a NameError.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail

        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d["email"])
        except Exception:
            # best effort: a mail failure is only noted in the log file below
            msg += "\nSending mail failed."
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def add_properties(self, xml):
        """Register this spider's custom properties on ``xml``.

        ``xml.add_property(name, label, kind)`` is called once per row of the
        table below, in the listed order.
        """
        property_table = (
            ("old_price", "Old Price", "decimal"),
            ("image_json", "Image Json", "text_list"),
            ("discount", "Discount", "decimal"),
            ("product_ref", "Product Ref.", "text"),
            ("in_stock", "In Stock", "text"),
            ("serial", "Serial", "text_list"),
            ("colors", "Colors", "text_list"),
            ("add_to_cart_id", "Add To Cart ID", "text"),
            ("shipping", "Shipping", "text"),
            ("warranty", "Warranty", "text_list"),
            ("heading", "Heading", "text"),
            ("details", "Details", "text"),
            ("specs", "Specs", "text"),
            ("call_to_action", "Call To Action", "text"),
            ("brand_image", "Brand Image", "text"),
            ("brand_image_promo", "Brand Image Promo", "text"),
        )
        for prop_name, prop_label, prop_kind in property_table:
            xml.add_property(prop_name, prop_label, prop_kind)

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the excel file.

        Reads the url, id and name columns, reports read failures through
        ``self.exc.code_handler(103, ...)``, then removes duplicates, splits
        off the products without URLs and adds the status lists.
        """
        workbook = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = dict()
        file_note = "\nError occurred for given file: {0}"
        try:
            self.products["urls"] = workbook.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = workbook.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = workbook.read_excel_collumn(2, 15)
        except IOError as io_error:
            report = "I/O error {0}: {1}".format(io_error.errno, io_error.strerror)
            report += file_note.format(self.d["file"])
            self.exc.code_handler(103, msg=report)
        except StandardError:
            report = "Error reading excel file" + file_note.format(self.d["file"])
            self.exc.code_handler(103, msg=report)
        self.products = workbook.delete_duplicates_dict(self.products)
        self.products, self.no_urls = workbook.separate_no_urls(self.products)
        self.products = workbook._add_none_status(self.products)
        self.no_urls = workbook._add_none_status(self.no_urls)

Exemple #11

0

Afficher le fichier

Fichier : sportman_spider.py Projet : marjevtic/testMarko

class SportmanSpider(CrawlSpider):
    name = "sportman"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Prepare the spider: arguments, xml writer, product lists, start URLs.

        The product lists are read from the database when the ``database``
        argument is set and from the excel file otherwise.
        """
        super(SportmanSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")

        if not self.d["database"]:
            self.get_lists_from_excel()
        else:
            self.database = Database()
            self.database.connect()
            selection = self.database.select_products(self.d["catalog_id"], self.d["product_id"])
            self.products, self.no_urls = selection
            self.database.disconnect()
        self.add_properties(self.xml)
        self.start_urls = self.products["urls"]
        self.images_store = "/" + settings["IMAGES_STORE"]
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page into a ``SportmanItem``.

        A redirected request is written to the xml as NOT_AVAILABLE with the
        id and name from ``self.products``. Otherwise the product data is
        written to the xml and the returned item only carries ``image_urls``.
        Returns the item, or ``None`` when scraping the page failed.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        redirected = "redirect_urls" in response.request.meta
        if redirected:
            cur_url = response.request.meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            if redirected:
                item["product_id"] = [self.products["product_ids"][index]]
                item["name"] = [self.products["names"][index]]
                item["in_stock"] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                (item["name"], item["short_desc"], item["description"], item["old_price"],
                 item["custom_price"], item["product_id"], item["sku"]) = self.get_basic_info(hxs)
                item["in_stock"] = ["IN_STOCK"]
                # only the first two of the eight values are needed here
                viewstate, eventval = self.get_vars(response, hxs)[:2]

                # the viewstate is stored in six pieces: five of 2000
                # characters and a last one holding the remainder
                for piece in range(6):
                    start = piece * 2000
                    end = start + 2000 if piece < 5 else None
                    item["viewstate%d" % (piece + 1)] = [basic.cdata(viewstate[start:end])]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)

                # parse the gallery once; get_server_path cleans its argument
                # in place, so it gets a copy and the item keeps the raw urls
                image_urls = self.get_images(hxs)
                item["normal_image_url"] = self.get_server_path(list(image_urls))

                self.xml.create_xml(item)
                item.clear()
                item["image_urls"] = image_urls
                self.products["status"][index] = "ran"
        except Exception:
            # record the status first so it is kept whatever the handler does
            self.products["status"][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            return item

    def get_basic_info(self, hxs):
        """Return ``(name, short_desc, description, old_price, price, id, sku)``.

        Prices are one-element lists holding the text after the first ":"
        with "Kr" and spaces removed; ``old_price`` stays an empty list when
        the page shows none. ``sku`` is the full result of the article number
        lookup and ``id`` a one-element list with its first entry.
        """

        def amount(texts):
            # "<label>: 1 299 Kr" -> ["1299"]
            joined = " ".join(texts)
            return [joined.split(":")[1].replace("Kr", "").replace(" ", "")]

        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
        short_desc = hxs.select('//div[@class="description2"]/text()').extract()

        description_html = hxs.select('//div[@id="fragment-1"]/div[@class="description"]').extract()
        description = [basic.cdata(sportman.delete_tags(re, description_html[0]))]

        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price:
            old_price = amount(old_price)

        # the current price is the "now" price when present, else the normal one
        price_texts = hxs.select('//span[@class="nowprice"]/text()').extract()
        if not price_texts:
            price_texts = hxs.select('//span[@class="normalprice"]/text()').extract()
        price = amount(price_texts)

        article_html = " ".join(hxs.select('//div[@class="articlenumber"]').extract())
        article_html = article_html.replace(u"\xa0", "")
        sku = basic.get_middle_text(article_html, "Art.nr.", "</div>")
        product_id = [sku[0]]

        return name, short_desc, description, old_price, price, product_id, sku

    def get_vars(self, response, hxs):
        """Collect the ASP.NET form state of the product page.

        Returns an 8-tuple: ``__VIEWSTATE`` and ``__EVENTVALIDATION`` from the
        already downloaded page (``hxs``), two empty-string placeholders, then
        ``__VIEWSTATE``, ``__EVENTVALIDATION`` and ``__PREVIOUSPAGE`` from a
        fresh download of ``response.url`` and the script manager hidden
        field value fetched from the script URL found in that fresh page.

        Performs two blocking HTTP requests per call.
        """
        headers1 = {
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1",
            "Host": "www.sportmann.no",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us,en;q=0.5",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Connection": "keep-alive",
            "Referer": "/product.aspx?productid=613232",
            "Cookie": "ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
        }

        page = hxs.select("//html").extract()
        page = " ".join(page)

        viewst = basic.get_middle_text(page, 'id="__VIEWSTATE" value="', '"')
        eventval = basic.get_middle_text(page, 'id="__EVENTVALIDATION" value="', '"')
        # previous page and hidden field of the scraped page are not collected
        prevpage = ""
        hidden_field = ""

        r = requests.get(response.url, headers=headers1)

        page_one = r.content

        viewst_page = basic.get_middle_text(page_one, 'id="__VIEWSTATE" value="', '"')
        eventval_page = basic.get_middle_text(page_one, 'id="__EVENTVALIDATION" value="', '"')
        prevpage_page = basic.get_middle_text(page_one, 'id="__PREVIOUSPAGE" value="', '"')

        # the script URL is taken from the last "<script sr" fragment between
        # the __VIEWSTATE and __PREVIOUSPAGE fields
        hidden_temp = page_one.split('id="__VIEWSTATE"')
        hidden_temp = hidden_temp[1].split('id="__PREVIOUSPAGE"')
        hidden_temp = hidden_temp[0].split("<script sr")

        hidden_temp = basic.get_middle_text(hidden_temp[-1], 'c="', '"')
        hidden_temp_val = hidden_temp[0]
        hidden_temp_val = hidden_temp_val.replace("amp;", "")
        hidden_url = "http://www.sportmann.no" + hidden_temp_val

        request_hidden = urllib2.Request(hidden_url)
        response_hidden = urllib2.urlopen(request_hidden)
        try:
            hidden_body = response_hidden.read()
        finally:
            # release the connection even when reading fails
            response_hidden.close()
        hidden_field_page = basic.get_middle_text(
            hidden_body, "ctl00_ScriptManager1_HiddenField').value += '", "';"
        )

        return (
            viewst[0],
            eventval[0],
            prevpage,
            hidden_field,
            viewst_page[0],
            eventval_page[0],
            prevpage_page[0],
            hidden_field_page[0],
        )

    def get_variants(self, hxs, response):
        """Return the color/size options of the product.

        The result is a list of cdata-wrapped json strings with the keys
        ``color``, ``color_value``, ``size`` and ``size_value``. Colors and
        sizes are read either from plain text plus a hidden input (no
        ``<select name`` inside the block) or from the options of the select
        element. When the size text is empty, one extra page is requested per
        color value through ``get_data``.

        Calls ``get_vars`` itself, which performs HTTP requests.
        """
        page = hxs.select("//html").extract()
        page = " ".join(page)
        # NOTE(review): dict_one is shared by all entries; each entry is
        # serialised right after it is filled, so keys that are not
        # overwritten keep the value of the previous entry.
        dict_one = {}
        test_one = []

        # markup of the color block, split around a possible <select>
        temp = page.split('<div class="color">')
        temp = temp[1].split("</div>")
        temp = temp[0].split("<select name")

        # only hidd_page, view_page, pre_page and even_page are used below
        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs
        )

        if len(temp) == 1:
            # no <select>: single color given as text plus a hidden input value
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value').extract()
            color[0] = color[0].replace("  ", "")
            color = basic.clean_string(color[0])
            # NOTE(review): value becomes a single extracted entry here
            # (presumably a string), while the select branch below keeps a
            # list; the range(len(value)) loop further down then walks it
            # element by element -- confirm this is intended.
            value = value[0]

        #            color = basic.clean_string(color[0])
        #            color = color.replace("  ","")
        #
        #            dict['color'] = color
        #            dict['color_value'] = value[0]

        else:
            # <select> present: collect the option labels and values that
            # follow the "farge</option>" entry
            test_color = basic.get_middle_text(temp[1], "farge</option>", "</select>")
            color = basic.get_middle_text(test_color[0], '">', "</option>")
            value = basic.get_middle_text(test_color[0], 'value="', '">')

            for i in range(0, len(color)):
                color[i] = color[i].replace("  ", "")
            #
            #                dict['color'] = color
            #                dict['color_value'] = value

        # markup of the size block, split around a possible <select>
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split("</div>")
        size_temp = size_temp[0].split("<select name")

        if len(size_temp) == 1:
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace("   ", "")]

            size_val = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value').extract()

            if size[0] == "":
                # no size text on the page: request the page once per color
                # value and read the sizes from each response
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i])

                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split("</select>")

                    if len(a_page) == 1:

                        size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"')
                        size_val = basic.get_middle_text(a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]

                    else:
                        a_page = basic.get_middle_text(a_page[0], "se</option>", "</select>")
                        size = basic.get_middle_text(a_page[0], '">', "</option>")
                        size_val = basic.get_middle_text(a_page[0], 'value="', '">')

                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val

                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace("   ", "")

                        # set inside the loop, so "size" is only (re)assigned
                        # when at least one size was found
                        dict_one["size"] = size

                    # NOTE(review): json here, simplejson in the branch below
                    test_one.append(basic.cdata(json.dumps(dict_one)))

            else:
                dict_one["color"] = color

                dict_one["color_value"] = value
                dict_one["size"] = size
                dict_one["size_value"] = size_val
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))

        else:
            # sizes offered through a <select>: options after "se</option>"
            test_size = basic.get_middle_text(size_temp[1], "se</option>", "</select>")
            size = basic.get_middle_text(test_size[0], '">', "</option>")
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')

            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace("   ", "")

            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one["size"] = size
            dict_one["size_value"] = size_val

            test_one.append(basic.cdata(json.dumps(dict_one)))

        return test_one

    def get_server_path(self, url):
        """Map every image url in ``url`` to its path in the image store.

        Each entry of ``url`` is replaced in place by its cleaned form; the
        returned list holds ``<images_store>/full/<sha1 of cleaned url>.jpg``
        for every entry.
        """
        stored_paths = []
        for position, raw_url in enumerate(url):
            cleaned = basic.clean_string(raw_url)
            # the caller's list is updated too
            url[position] = cleaned
            digest = hashlib.sha1(cleaned).hexdigest()
            stored_paths.append(self.images_store + "/full/" + digest + ".jpg")
        return stored_paths

    def get_images(self, hxs):
        """Return the absolute urls of the gallery images.

        Every ``src`` found between the "gallery_demo_unstyled" marker and the
        right container is prefixed with the site address.
        """
        html = " ".join(hxs.select("//html").extract())
        gallery = html.split('class="gallery_demo_unstyled"')[1]
        gallery = gallery.split('<div class="right_container">')[0]
        sources = basic.get_middle_text(gallery, 'src="', '"')
        return ["http://www.sportmann.no" + source for source in sources]

    def get_data(self, response, hidden, viewstate, previouspage, eventvalidation, colorvalue):
        """Request the product page for one color variant and return its body.

        The form state values (``hidden``, ``viewstate``, ``previouspage``,
        ``eventvalidation``) are url-encoded and placed into a fixed request
        body together with ``colorvalue``; the body is sent to
        ``response.url`` with ``urllib2``. Only the url of ``response`` is
        used. Returns the raw response body.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Host": "www.sportmann.no",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us,en;q=0.5",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Connection": "keep-alive",
            "Referer": "http://www.sportmann.no/product.aspx?productid=613232",
            "Cookie": "",
        }

        # each urlencode call yields one "name=value" pair ready to be joined
        eventvalidation = urllib.urlencode({"__EVENTVALIDATION": eventvalidation})
        viewstate = urllib.urlencode({"__VIEWSTATE": viewstate})
        previouspage = urllib.urlencode({"__PREVIOUSPAGE": previouspage})
        hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden})

        # NOTE(review): colorvalue is concatenated as-is, without url-encoding
        data = (
            "ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel%7Cctl00%24ContentPlaceHolder1%24ddlVariant&"
            + hidden
            + "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant&__EVENTARGUMENT=&__LASTFOCUS=&"
            + viewstate
            + "&"
            + previouspage
            + "&"
            + eventvalidation
            + "&ctl00%24ProductSearch%24txtProdSearch=&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState=&ctl00%24ContentPlaceHolder1%24ddlVariant="
            + colorvalue
            + "&ctl00%24ContentPlaceHolder1%24Variant1Hidden=&ctl00%24ContentPlaceHolder1%24Variant2Hidden=&ctl00%24ContentPlaceHolder1%24tbAmount=1&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState=&__ASYNCPOST=true&"
        )

        # r = requests.get(response.url, h)
        # passing a data argument makes urllib2 send the request as a POST
        req = urllib2.Request(response.url, data, headers)

        # NOTE(review): the object returned by urlopen is not closed explicitly
        resp_page = urllib2.urlopen(req).read()

        return resp_page

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.

        ``spider`` is the argument delivered with the signal; it is not used.
        When running from the database, the report is also written to
        ``logs/<spider name>/<filename>``.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d["database"]:
            # Fallback name: if the database round-trip below fails before the
            # catalog name is fetched, the xml/mail/log steps still need a
            # filename instead of failing with a NameError.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail

        mail = Mail()
        try:
            mail.send_mail(msg, "Sportmann: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "Sportmann: {0}".format(filename), self.d["email"])
        except Exception:
            # best effort: a mail failure is only noted in the log file below
            msg += "\nSending mail failed."
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the excel file.

        Reads the url, id and name columns, reports read failures through
        ``self.exc.code_handler(103, ...)``, then removes duplicates, splits
        off the products without URLs and adds the status lists.
        """
        workbook = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = dict()
        file_note = "\nError occurred for given file: {0}"
        try:
            self.products["urls"] = workbook.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = workbook.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = workbook.read_excel_collumn(2, 15)
        except IOError as io_error:
            report = "I/O error {0}: {1}".format(io_error.errno, io_error.strerror)
            report += file_note.format(self.d["file"])
            self.exc.code_handler(103, msg=report)
        except StandardError:
            report = "Error reading excel file" + file_note.format(self.d["file"])
            self.exc.code_handler(103, msg=report)
        self.products = workbook.delete_duplicates_dict(self.products)
        self.products, self.no_urls = workbook.separate_no_urls(self.products)
        self.products = workbook._add_none_status(self.products)
        self.no_urls = workbook._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register this spider's custom properties on ``xml``.

        ``xml.add_property(name, label, kind)`` is called once per row of the
        table below, in the listed order.
        """
        property_table = (
            ("short_desc", "Short Description", "text"),
            ("old_price", "Old Price", "text"),
            ("custom_price", "New Price", "text"),
            ("color_value", "Color Value", "text"),
            ("in_stock", "In Stock", "text"),
            ("size_val", "Size Value", "text_list"),
            ("sku", "Sku", "text"),
            ("size_options", "Size_options", "text_list"),
            ("viewstate1", "Viewstate1", "text_list"),
            ("viewstate2", "Viewstate2", "text_list"),
            ("viewstate3", "Viewstate3", "text_list"),
            ("viewstate4", "Viewstate4", "text_list"),
            ("viewstate5", "Viewstate5", "text_list"),
            ("viewstate6", "Viewstate6", "text_list"),
            ("eventval", "Eventval", "text_list"),
            ("hidden", "Hidden Field", "text_list"),
            ("prevpage", "Previous Page", "text_list"),
            ("recommended_product", "Recommended Product", "text_list"),
        )
        for prop_name, prop_label, prop_kind in property_table:
            xml.add_property(prop_name, prop_label, prop_kind)

Exemple #12

0

Afficher le fichier

Fichier : burton_spider.py Projet : marjevtic/testMarko

class BurtonSpider(CrawlSpider):
    """Spider that turns Burton product pages into CommonXml entries.

    The product list is read either from the database or from an Excel
    sheet, depending on the arguments returned by DatabaseTerminal.
    """
    name = "burton"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0  # number of responses handed to parse() so far

    def __init__(self, *a, **kw):
        """Load the product lists and prepare the XML writer.

        Positional and keyword arguments are passed through unchanged to
        the CrawlSpider constructor.
        """
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        # Crawl exactly the URLs that parse() can look up in self.products.
        # A hard-coded single debug URL used to overwrite this list, which
        # made the lookup in parse() fail for every real run.
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Build a BurtonItem from one product page response.

        A redirected request is written to the XML as NOT_AVAILABLE with
        only its id and name. Returns the item on success and None when
        extraction raised.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        page = hxs.extract()
        redirected = 'redirect_urls' in response.request.meta
        if redirected:
            # Look the product up under the URL that was originally requested.
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if redirected:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                # image_urls is set after create_xml(), so it is not part of
                # the XML entry written above.
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # are not swallowed while still recording any scraping failure.
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
            return None
        return item

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE XML entry for every product without a URL."""
        # zip() pairs each id with the name at the same position; looking the
        # position up with list.index() returned the first match, which is
        # the wrong row when an id occurs more than once.
        for product_id, product_name in zip(self.no_urls['product_ids'],
                                            self.no_urls['names']):
            item = BurtonItem()
            item['product_id'] = [product_id]
            item['name'] = [product_name]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return (product_id, name) as the lists extracted from the page."""
        name = hxs.select('//h1[@class="productHeading"]/text()').extract()
        product_id = hxs.select('//input[@name="productId"]/@value').extract()
        return product_id, name

    def get_server_path(self, url):
        """Return the images-store path built from the SHA-1 of *url*."""
        path = self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"
        return path

    def get_prices(self, hxs):
        """Return (price, old_price) as lists.

        Raises IndexError when the page has no price element; old_price is
        an empty list when the page has no old price element.
        """
        price = hxs.select('//div[@class="op"]/text()').extract()
        price = [basic.get_price(price[0])]
        old_price = hxs.select('//span[@class="lp"]/text()').extract()
        if old_price:
            old_price = [basic.get_price(old_price[0])]
        return price, old_price

    def get_description(self, hxs):
        """Return (description, features), both wrapped by the cdata helpers.

        The description is the fourth text node of the product info block
        (IndexError when there are fewer); features are cut to 2000 chars.
        """
        description = hxs.select('//div[@id="FieldsetProductInfo"]/text()').extract()[3]
        features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if features:
            features = [features[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(features)

    def get_variants(self, page):
        """Gets jsons for colors with all available sizes.

        Returns (sizes, image_urls, color_names): the re-serialised json of
        every ``skuSizeColorObj`` entry, the original swatch urls and the
        color descriptions, all in page order.
        """
        script = basic.get_middle_text(page, 'var skuSizeColorObj = new Array();', '</script>')[0]
        sizes = []
        image_urls = []
        color_names = []
        # The text before the first 'skuSizeColorObj' is not an entry.
        for chunk in script.split('skuSizeColorObj')[1:]:
            temp = basic.get_middle_text(chunk, '= ', ';')
            t = simplejson.loads(burton.replace_for_json(temp[0]))
            image_urls.append(t['swatchURL'])
            color_names.append(t['ColorDesc'])
            # The json keeps the swatch, but pointing at the images store.
            t['swatchURL'] = self.get_server_path(t['swatchURL'])
            sizes.append(basic.cdata(simplejson.dumps(t)))
        return sizes, image_urls, color_names

    def get_all_sizes(self, page):
        """Return a one-element list with the json of all sizes on the page."""
        script = basic.get_middle_text(page, 'var distsizeobj=new Array();', 'var indexcolor=0;')[0]
        all_sizes = basic.get_middle_text(script, ']="', '";')
        return [basic.cdata(simplejson.dumps(all_sizes))]

    def get_colors(self, page, color_names):
        """Gets color information with images from javascript on the page.

        Returns (colors_json, image_urls): one json per entry of
        *color_names* holding the color name and the images-store path of
        its image, and the list of original image urls that can be used
        for download later.
        """
        script = basic.get_middle_text(page, 'var imageMap_0 = new Array();', '</script>')[0]
        colors = basic.get_middle_text(script, '] = ', ';')
        image_urls = []
        colors_json = []
        for i, color_name in enumerate(color_names):
            # Indexing (not zip) keeps the IndexError when the page lists
            # fewer image entries than colors, so parse() reports an error.
            color = simplejson.loads(burton.replace_color_json(colors[i]))
            color['cname'] = color_name
            color.pop('reg')
            image_urls.append(color['enh'])
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        filename = None
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
                if filename is None:
                    # No catalog name was obtained; fall back to the spider
                    # name so the XML, mail and log steps below do not die
                    # with a NameError and lose the whole report.
                    filename = self.name
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename), self.d['email'])
        except Exception:
            # Only the log file written below can still show this line.
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill self.products and self.no_urls from the Excel sheet.

        Read failures are reported through self.exc.code_handler(103, ...).
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

Exemple #13

0

Afficher le fichier

class GuitarCenterSpider(CrawlSpider):
    """Spider that turns Guitar Center product pages into CommonXml entries.

    The product list is read either from the database or from an Excel
    sheet, depending on the arguments returned by DatabaseTerminal.
    """
    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    counter = 0  # number of responses handed to parse() so far

    def __init__(self, *a, **kw):
        """Load the product lists and prepare the XML writer.

        Positional and keyword arguments are passed through unchanged to
        the CrawlSpider constructor.
        """
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'],
                                                                        self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Build a GuitarCenterItem from one product page response.

        The item is written to the XML only when every field was extracted
        without error; it is returned in either case.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()
        if 'redirect_urls' in response.request.meta:
            # Look the product up under the URL that was originally requested.
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'], item['brand'] = self.get_basic_info(hxs)
            item['heading'], item['details'], item['specs'], item['call_to_action'] = self.get_description(hxs)
            item['brand_image'], item['brand_image_promo'], brand_images = self.get_description_images(hxs)
            item['old_price'], item['discount'], item['price'] = self.get_prices(hxs)
            item['image_json'], img = self.get_images(hxs)
            item['serial'] = self.get_serials(hxs)
            item['warranty'] = self.gold_coverage(hxs)
            item['in_stock'] = self.get_available(hxs)
            item['product_ref'], item['add_to_cart_id'] = self.get_add_to_cart(hxs)
            if not item['add_to_cart_id']:
                # Without an add-to-cart id the product is reported as
                # unavailable regardless of the availability marker.
                item['in_stock'] = ["NOT_AVAILABLE"]
            item['shipping'] = self.get_shipping(hxs)
            item['colors'] = self.get_colors(hxs)
            self.products['status'][index] = "ran"
        except StandardError:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            # image_urls is set after create_xml(), so it is not part of
            # the XML entry written above.
            item['image_urls'] = img + brand_images
        return item

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE XML entry for every product without a URL."""
        # zip() pairs each id with the name at the same position; looking the
        # position up with list.index() returned the first match, which is
        # the wrong row when an id occurs more than once.
        for product_id, product_name in zip(self.no_urls['product_ids'],
                                            self.no_urls['names']):
            item = GuitarCenterItem()
            item['product_id'] = [product_id]
            item['name'] = [product_name]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return (name, brand); name is one cleaned string in a list."""
        name = hxs.select('//h1[@class="fn"]/text()').extract()
        name = [basic.clean_string("".join(name))]
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        # Drop non-breaking spaces left in the heading text.
        name = [name[0].replace(u"\xa0", "")]
        return name, brand

    def get_description_images(self, hxs):
        """Return (brand_image, brand_image_promo, images).

        The first two are passed through get_server_path(); *images* holds
        the urls exactly as found on the page.
        """
        brand_image = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        brand_image_promo = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        images = brand_image + brand_image_promo
        if brand_image:
            brand_image = [self.get_server_path(brand_image[0])]
        if brand_image_promo:
            brand_image_promo = [self.get_server_path(brand_image_promo[0])]
        return brand_image, brand_image_promo, images

    def get_description(self, hxs):
        """Return (heading, details, specs, call_to_action) cdata fields."""
        heading = hxs.select('//div[@id="description"]/p').extract()
        details = hxs.select('//p[@class="description"]').extract()
        specs = hxs.select('//div[@class="specs"]/ul').extract()
        last = hxs.select('//div[@class="callToAction"]/p/text()').extract()
        return basic.cdata_field(heading), basic.cdata_field(details), basic.cdata_field(specs), basic.cdata_field(last)

    def get_prices(self, hxs):
        """Return (old_price, discount, price), each a list of cleaned strings.

        Any of the three is an empty list when the page has no value for it.
        """
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
        try:
            # The last line item is the one reported as the discount.
            discount = [basic.clean_string(value[-1])]
        except IndexError:
            print("This product has no price.")
        # extract() returns a (possibly empty) list, so no IndexError guard
        # is needed around this lookup.
        price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    def get_images(self, hxs):
        """Return (images_list, img).

        *images_list* holds one json per thumbnail with its image url and
        the serial of the product the image refers to; *img* holds the raw
        urls.
        """
        images = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        images_list = []
        img = []
        for i, image in enumerate(images):
            entry = {
                'image_url': self.get_server_path(image),
                # replace() is a no-op when the prefix is absent, so no
                # membership test is needed first.
                'product_serial': tags[i].replace("site1sku", ""),
            }
            img.append(image)
            images_list.append(basic.cdata(simplejson.dumps(entry)))
        return images_list, img

    def _json_cdata_list(self, hxs, xpath):
        """Re-serialise every json text matched by *xpath* into a cdata field."""
        raw_values = hxs.select(xpath).extract()
        return [basic.cdata(simplejson.dumps(simplejson.loads(raw)))
                for raw in raw_values]

    def get_serials(self, hxs):
        """Return a list of jsons with all information about the serials.

        Can be modified to return dicts if sub-products ever need them.
        """
        return self._json_cdata_list(hxs, '//var[@class="hidden styleInfo"]/text()')

    def get_server_path(self, url):
        """Return the path stored for an image; currently *url* unchanged.

        The absolute image path from the site is kept. The disabled
        alternative built a local path of the form
        IMAGES_STORE + "/full/" + sha1(url) + ".jpg".
        """
        return url

    def gold_coverage(self, hxs):
        """Return the gold coverage (additional warranty) options as jsons."""
        ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        new = []
        for i, option_id in enumerate(ids):
            # Indexing keeps the IndexError when a checkbox has no label.
            option = {'id': option_id, 'name': labels[i]}
            new.append(basic.cdata(simplejson.dumps(option)))
        return new

    def get_available(self, hxs):
        """Return the availability as a one-element list.

        "in_stock" is upper-cased, any other marker is returned as found,
        and a page without a marker is reported as IN_STOCK.
        """
        p = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if not p:
            # Products with color options carry the status per option and
            # have no product-level marker on the page.
            return ["IN_STOCK"]
        if p[0] == "in_stock":
            p = [p[0].upper()]
        return p

    def get_add_to_cart(self, hxs):
        """Return (product_ref, add_to_cart_id) as one-element lists.

        Both lists are empty when the page has no add-to-list link.
        """
        try:
            temp = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()[0]
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate.
            print("Product not available")
            return [], []
        parts = temp.split("|")
        return [parts[0]], [parts[1]]

    def get_shipping(self, hxs):
        """Return the shipping information texts found on the page."""
        return hxs.select('//div[@id="targeter_pdpShipping"]/span/text()').extract()

    def get_colors(self, hxs):
        """Return a list of jsons with all the data about the color options."""
        return self._json_cdata_list(hxs, '//var[@class="styleInfo"]/text()')

    def clean_price(self, price):
        """Return *price* with everything but digits and dots removed."""
        return [re.sub(r'[^0-9.]', '', p) for p in price]

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        filename = None
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
                if filename is None:
                    # No catalog name was obtained; fall back to the spider
                    # name so the XML, mail and log steps below do not die
                    # with a NameError and lose the whole report.
                    filename = self.name
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d['email'])
        except Exception:
            # Only the log file written below can still show this line.
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def add_properties(self, xml):
        """Register this spider's custom properties on *xml*."""
        xml.add_property("old_price", "Old Price", "decimal")
        xml.add_property("image_json", "Image Json", "text_list")
        xml.add_property("discount", "Discount", "decimal")
        xml.add_property("product_ref", "Product Ref.", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("serial", "Serial", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("shipping", "Shipping", "text")
        xml.add_property("warranty", "Warranty", "text_list")
        xml.add_property("heading", "Heading", "text")
        xml.add_property("details", "Details", "text")
        xml.add_property("specs", "Specs", "text")
        xml.add_property("call_to_action", "Call To Action", "text")
        xml.add_property("brand_image", "Brand Image", "text")
        xml.add_property("brand_image_promo", "Brand Image Promo", "text")

    def get_lists_from_excel(self):
        """Fill self.products and self.no_urls from the Excel sheet.

        Read failures are reported through self.exc.code_handler(103, ...).
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

Exemple #14

0

Afficher le fichier

Fichier : burton_spider.py Projet : marjevtic/testMarko

class BurtonSpider(CrawlSpider):
    """Spider that turns Burton product pages into CommonXml entries.

    The product list is read either from the database or from an Excel
    sheet, depending on the arguments returned by DatabaseTerminal.
    """
    name = "burton"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0  # number of responses handed to parse() so far

    def __init__(self, *a, **kw):
        """Load the product lists and prepare the XML writer.

        Positional and keyword arguments are passed through unchanged to
        the CrawlSpider constructor.
        """
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        # Crawl exactly the URLs that parse() can look up in self.products.
        # A hard-coded single debug URL used to overwrite this list, which
        # made the lookup in parse() fail for every real run.
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Build a BurtonItem from one product page response.

        A redirected request is written to the XML as NOT_AVAILABLE with
        only its id and name. Returns the item on success and None when
        extraction raised.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        page = hxs.extract()
        redirected = 'redirect_urls' in response.request.meta
        if redirected:
            # Look the product up under the URL that was originally requested.
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if redirected:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(
                    hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(
                    page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(
                    page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                # image_urls is set after create_xml(), so it is not part of
                # the XML entry written above.
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # are not swallowed while still recording any scraping failure.
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
            return None
        return item

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE XML entry for every product without a URL."""
        # zip() pairs each id with the name at the same position; looking the
        # position up with list.index() returned the first match, which is
        # the wrong row when an id occurs more than once.
        for product_id, product_name in zip(self.no_urls['product_ids'],
                                            self.no_urls['names']):
            item = BurtonItem()
            item['product_id'] = [product_id]
            item['name'] = [product_name]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return (product_id, name) as the lists extracted from the page."""
        name = hxs.select('//h1[@class="productHeading"]/text()').extract()
        product_id = hxs.select('//input[@name="productId"]/@value').extract()
        return product_id, name

    def get_server_path(self, url):
        """Return the images-store path built from the SHA-1 of *url*."""
        path = self.images_store + "/full/" + hashlib.sha1(
            url).hexdigest() + ".jpg"
        return path

    def get_prices(self, hxs):
        """Return (price, old_price) as lists.

        Raises IndexError when the page has no price element; old_price is
        an empty list when the page has no old price element.
        """
        price = hxs.select('//div[@class="op"]/text()').extract()
        price = [basic.get_price(price[0])]
        old_price = hxs.select('//span[@class="lp"]/text()').extract()
        if old_price:
            old_price = [basic.get_price(old_price[0])]
        return price, old_price

    def get_description(self, hxs):
        """Return (description, features), both wrapped by the cdata helpers.

        The description is the fourth text node of the product info block
        (IndexError when there are fewer); features are cut to 2000 chars.
        """
        description = hxs.select(
            '//div[@id="FieldsetProductInfo"]/text()').extract()[3]
        features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if features:
            features = [features[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(features)

    def get_variants(self, page):
        """Gets jsons for colors with all available sizes.

        Returns (sizes, image_urls, color_names): the re-serialised json of
        every ``skuSizeColorObj`` entry, the original swatch urls and the
        color descriptions, all in page order.
        """
        script = basic.get_middle_text(page,
                                       'var skuSizeColorObj = new Array();',
                                       '</script>')[0]
        sizes = []
        image_urls = []
        color_names = []
        # The text before the first 'skuSizeColorObj' is not an entry.
        for chunk in script.split('skuSizeColorObj')[1:]:
            temp = basic.get_middle_text(chunk, '= ', ';')
            t = simplejson.loads(burton.replace_for_json(temp[0]))
            image_urls.append(t['swatchURL'])
            color_names.append(t['ColorDesc'])
            # The json keeps the swatch, but pointing at the images store.
            t['swatchURL'] = self.get_server_path(t['swatchURL'])
            sizes.append(basic.cdata(simplejson.dumps(t)))
        return sizes, image_urls, color_names

    def get_all_sizes(self, page):
        """Return a one-element list with the json of all sizes on the page."""
        script = basic.get_middle_text(page, 'var distsizeobj=new Array();',
                                       'var indexcolor=0;')[0]
        all_sizes = basic.get_middle_text(script, ']="', '";')
        return [basic.cdata(simplejson.dumps(all_sizes))]

    def get_colors(self, page, color_names):
        """Gets color information with images from javascript on the page.

        Returns (colors_json, image_urls): one json per entry of
        *color_names* holding the color name and the images-store path of
        its image, and the list of original image urls that can be used
        for download later.
        """
        script = basic.get_middle_text(page, 'var imageMap_0 = new Array();',
                                       '</script>')[0]
        colors = basic.get_middle_text(script, '] = ', ';')
        image_urls = []
        colors_json = []
        for i, color_name in enumerate(color_names):
            # Indexing (not zip) keeps the IndexError when the page lists
            # fewer image entries than colors, so parse() reports an error.
            color = simplejson.loads(burton.replace_color_json(colors[i]))
            color['cname'] = color_name
            color.pop('reg')
            image_urls.append(color['enh'])
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml
        filename = None
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
                if filename is None:
                    # No catalog name was obtained; fall back to the spider
                    # name so the XML, mail and log steps below do not die
                    # with a NameError and lose the whole report.
                    filename = self.name
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename),
                               self.d['email'])
        except Exception:
            # Only the log file written below can still show this line.
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill self.products and self.no_urls from the Excel sheet.

        Read failures are reported through self.exc.code_handler(103, ...).
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

Exemple #15

0

Afficher le fichier

class BootsSpider(CrawlSpider):
    """CrawlSpider subclass that writes scraped "boots" products to XML.

    The product list comes from the database or from an excel sheet,
    depending on the parsed command-line arguments. Every scraped product
    and each of its colour/size variants is passed to ``CommonXml``;
    ``spider_closed`` writes the file and optionally exports it.
    """
    name = "boots"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com"]
    counter = 0
    # Prefix of stored image paths; the SHA-1 of the source url is appended.
    _IMAGE_PATH_PREFIX = "43662980-f344-11e1-a21f-0800200c9a66/full/"
    # Maximum number of characters put into a single description_N field.
    _DESCRIPTION_CHUNK = 2000

    def __init__(self, *a, **kw):
        """Read the run arguments and load the list of products to scrape."""
        super(BootsSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Scrape one product page, write it to the XML and return the item.

        Products with colour or size variants are written as a master
        product followed by one XML entry per variant; products without
        variants carry their prices directly on the item.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BootsItem()
        (item['product_id'], item['store_id'], item['lang_id'],
         item['catalog_id']) = self.get_ids(hxs)
        item['name'] = self.get_name(hxs)
        (item['short_description'], sponsored, description, in_stock,
         item['ingredients'], patient_information_url, item['offer'],
         item['promotion']) = self.get_description(hxs)
        item['rating'] = self.get_rating(hxs)
        size, price_per_size = self.get_size(hxs)
        item['normal_image_url'], image_urls = self.get_images(hxs)
        brand, brand_image_url = self.get_brand(hxs)
        item['save_money'], item['old_price'] = self.get_oldies(hxs)
        for number, chunk in enumerate(description, 1):
            item['description_%d' % number] = [basic.cdata(chunk)]
        # NOTE(review): get_description returns ``[sponsored]``, a list, so
        # this check is always true and ``[None]`` may be stored - confirm.
        if sponsored is not None:
            item['sponsored'] = sponsored
        item['in_stock'] = ["NOT_IN_STOCK"]
        if in_stock == "In stock":
            item['in_stock'] = ["IN_STOCK"]
            item['order_id'] = hxs.select(
                '//input[@name="orderId"]/@value').extract()
            item['cat_entry_id'] = hxs.select(
                '//input[@name="catEntryId"]/@value').extract()
            item['calculation_usage_id'] = hxs.select(
                '//input[@name="calculationUsageId"]/@value').extract()
        if brand_image_url is not None:
            item['brand'] = brand
            item['brand_image_url'] = [
                self._IMAGE_PATH_PREFIX + self.get_image_sha1(brand_image_url)]
            image_urls.append(brand_image_url)
        if patient_information_url is not None:
            item['patient_information_url'] = [
                basic.cdata(patient_information_url)]
        (prices, point_prices, collect_points, colors, color_image_urls,
         variant_ids) = self.get_color_variants(hxs)
        if size is not None:
            item['size'] = size
            item['price_per_size'] = price_per_size
        elif variant_ids is None:
            # No single size and no colour variants: try size variants.
            (prices, point_prices, collect_points, sizes,
             variant_ids) = self.get_size_variants(hxs)
        if color_image_urls is not None:
            image_urls.extend(color_image_urls)
        if variant_ids is not None:
            self.xml.create_xml(item)
            if colors is not None:
                self.create_color_variants(
                    prices, point_prices, colors, color_image_urls,
                    variant_ids, collect_points, item['product_id'])
            else:
                self.create_size_variants(
                    prices, point_prices, sizes, variant_ids, collect_points,
                    item['product_id'])
        else:
            prices = hxs.select('//p[@class="price"]/text()').extract()[0]
            point_prices = hxs.select(
                '//span[@class="pointsPrice"]/text()').extract()[0]
            collect_points = [basic.get_price(hxs.select(
                '//p[@class="collectPoints"]/text()').extract()[0])]
            item['price'] = [basic.get_price(prices)]
            item['points_price'] = [basic.get_price(point_prices)]
            item['collect_points'] = collect_points
            self.xml.create_xml(item)
        item['image_urls'] = image_urls
        return item

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE entry for every product that has no url."""
        names = self.no_urls['names']
        # enumerate() instead of list.index(): linear, and duplicated ids
        # keep the name stored at their own position.
        for index, product_id in enumerate(self.no_urls['product_ids']):
            item = BootsItem()
            item['product_id'] = [product_id]
            item['name'] = [names[index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_ids(self, hxs):
        """Return product, store, language and catalog id as 1-item lists.

        Raises IndexError when one of the hidden inputs is missing.
        """
        product_id = hxs.select('//input[@name="productId"]/@value').extract()[0]
        store_id = hxs.select('//input[@name="storeId"]/@value').extract()[0]
        lang_id = hxs.select('//input[@name="langId"]/@value').extract()[0]
        catalog_id = hxs.select('//input[@name="catalogId"]/@value').extract()[0]
        return [product_id], [store_id], [lang_id], [catalog_id]

    def get_name(self, hxs):
        """Return the product name wrapped in a list."""
        name = hxs.select('//span[@class="pd_productNameSpan"]/text()').extract()[0]
        return [name]

    def get_description(self, hxs):
        """Collect the descriptive parts of the product page.

        Returns a tuple ``(short_description, sponsored, description,
        in_stock, ingredients, patient_information_url, offer, promotion)``.
        ``description`` is a list of chunks of at most 2000 characters each;
        optional parts that are not found are reported as ``None``.
        """
        short_description = hxs.select('//div[@class="productIntroCopy"]').extract()[0]
        try:
            suitable_for = ''.join(hxs.select('//div[@id="suitableFor"]//h4 | //div[@id="suitableFor"]//p | //div[@id="suitableFor"]//div').extract())
            short_description += suitable_for
        except Exception:
            print("There's no suitable_for section")
        try:
            ingredients = basic.clean_string(' '.join(hxs.select('//div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]/p | //div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]//div').extract()))
            if ingredients != '':
                ingredients = basic.cdata(ingredients)
        except Exception:
            print("No ingredients found!")
            ingredients = None
        try:
            patient_information_url = hxs.select('//div[@class="downloadMedia"]//a/@href').extract()[0]
        except Exception:
            print("No patient information found!")
            patient_information_url = None
        try:
            offer = hxs.select('//div[@id="mainOffer"]//a/text()').extract()[0]
        except Exception:
            print("No special offer found!")
            offer = None
        try:
            promotion = hxs.select('//div[@id="otherOffers"]//a/text()').extract()
        except Exception:
            print("No promotion found!")
            promotion = None
        try:
            sponsored = hxs.select('//div[@class="sponsored"]//p/text()').extract()[0]
        except Exception:
            print("No sponsor message found!")
            sponsored = None
        description = ''.join(hxs.select('//div[@id="detailedInfo"]//div[@class="pd_panelInner"]//div[@class="pd_HTML"]').extract())
        description = basic.clean_string(description)
        # Split into consecutive chunks. The previous slice ended at
        # ``2000*(i+1)-1`` and so lost one character at every boundary.
        step = self._DESCRIPTION_CHUNK
        desc = [description[start:start + step]
                for start in range(0, len(description), step)]
        if not desc:
            # Keep a single (empty) chunk for an empty description.
            desc = [description]
        try:
            in_stock = hxs.select('//div[@class="icon_pl_stock"]/text()').extract()[0]
        except Exception:
            in_stock = ""
        return ([basic.cdata(basic.clean_string(short_description))],
                [sponsored], desc, in_stock, [ingredients],
                patient_information_url, [offer], promotion)

    def get_images(self, hxs):
        """Return ``([stored image path], [source image urls])``."""
        source_url = hxs.select('//meta[@property="og:image"]//@content').extract()[0]
        image_urls = [source_url]
        stored_path = self._IMAGE_PATH_PREFIX + self.get_image_sha1(source_url)
        return [stored_path], image_urls

    def get_brand(self, hxs):
        """Return ``([brand], brand_image_url)`` or ``(None, None)``."""
        try:
            brand = hxs.select('//div[@class="pd_brand"]//div//a//span//img/@alt').extract()[0]
            brand_image_url = hxs.select('//div[@class="pd_brand"]//div//a//span//img/@src').extract()[0]
            return [brand], brand_image_url
        except Exception:
            print("No brand name or image found!")
            return None, None

    def get_rating(self, hxs):
        """Return the average rating as a 1-item list, "0.0" if absent."""
        try:
            rating = hxs.select('//span[@property="v:average"]/text()').extract()[0]
        except Exception:
            rating = "0.0"
        return [rating]

    def get_size(self, hxs):
        """Return ``([size], [price_per_size])`` or ``(None, None)``."""
        try:
            size = hxs.select('//span[@class="size"]/text()').extract()[0]
            size = basic.clean_string(size)
            size = size.replace("|", "")
            price_per_size = hxs.select('//span[@class="pricePerSize"]/text()').extract()[0]
            return [size], [price_per_size]
        except Exception:
            print("No size found")
            return None, None

    def get_oldies(self, hxs):
        """Return ``([saved amount], [old price])``; ``[None]`` when absent."""
        try:
            save = hxs.select('//span[@class="save"]/text()').extract()[0]
            old = hxs.select('//span[@class="oldPrice"]/text()').extract()[0]
            save = basic.get_price(save)
            old = basic.get_price(old)
        except Exception:
            save = None
            old = None
        return [save], [old]

    def get_color_variants(self, hxs):
        """Read colour variants from the page script.

        Returns ``(prices, point_prices, collect_points, colors,
        color_image_urls, variant_ids)``, or six ``None`` values when the
        variant data cannot be read.
        """
        try:
            variants = hxs.select('//script').re(r'productCode:\".*\d\"')[0].split(",")
            colors = hxs.select('//div[@class="gp_80-20a column"]//div[@class="innerColumn"]//fieldset//div//label//span/text()').extract()
            color_image_urls = hxs.select('//div[@class="gp_80-20a column"]//div[@class="innerColumn"]//fieldset//div//label//img//@src').extract()
            collect_points = []
            prices = []
            point_prices = []
            variant_ids = []
            # The loop reads fields at offsets 0, 2 and 5 of each group of 8.
            for i in range(0, len(variants), 8):
                price = basic.get_price(variants[i + 2])
                prices.append(price)
                point_prices.append(str(int(float(price) * 100)))
                variant_ids.append(basic.get_price(variants[i]))
                collect_points.append(basic.get_price(variants[i + 5]))
            return prices, point_prices, collect_points, colors, color_image_urls, variant_ids
        except Exception:
            print("No color variants found")
            return None, None, None, None, None, None

    def get_size_variants(self, hxs):
        """Read size variants from the page script.

        Returns ``(prices, point_prices, collect_points, sizes,
        variant_ids)``, or five ``None`` values when no variant script is
        found.
        """
        try:
            variants = hxs.select('//script').re(r'productCode:\".*\d\"')[0].split(",")
        except Exception:
            print("No size variants found")
            return None, None, None, None, None
        # The first <option> is skipped.
        sizes = hxs.select('//select[@id="size_x"]//option/text()').extract()[1:]
        collect_points = []
        prices = []
        point_prices = []
        variant_ids = []
        # Starts at index 7 and reads offsets 1, 2 and 4 of each group of 7.
        for i in range(7, len(variants), 7):
            price = basic.get_price(variants[i + 2])
            prices.append(price)
            point_prices.append(str(int(float(price) * 100)))
            variant_ids.append(basic.get_price(variants[i + 4]))
            collect_points.append(basic.get_price(variants[i + 1]))
        return prices, point_prices, collect_points, sizes, variant_ids

    def create_color_variants(self, prices, point_prices, colors, color_image_urls, variant_ids, collect_points, product_id):
        """Write one XML child product per colour of the master product."""
        for i, color in enumerate(colors):
            variant = BootsItem()
            variant['master_product_id'] = product_id
            variant['product_id'] = [variant_ids[i]]
            variant['price'] = [prices[i]]
            variant['points_price'] = [point_prices[i]]
            # NOTE(review): every variant gets the first collect_points
            # value, not its own - confirm this is intended.
            variant['collect_points'] = [collect_points[0]]
            variant['color'] = [color]
            variant['color_image_url'] = [
                self._IMAGE_PATH_PREFIX
                + self.get_image_sha1(color_image_urls[i])]
            self.xml.create_xml(variant)

    def create_size_variants(self, prices, point_prices, sizes, variant_ids, collect_points, product_id):
        """Write one XML child product per size of the master product."""
        for i, size in enumerate(sizes):
            variant = BootsItem()
            variant['master_product_id'] = product_id
            variant['product_id'] = [variant_ids[i]]
            variant['price'] = [prices[i]]
            variant['points_price'] = [point_prices[i]]
            # NOTE(review): every variant gets the first collect_points
            # value, not its own - confirm this is intended.
            variant['collect_points'] = [collect_points[0]]
            variant['size'] = [size]
            self.xml.create_xml(variant)

    def get_image_sha1(self, image_url):
        """Return the hexadecimal SHA-1 digest of ``image_url``."""
        return hashlib.sha1(image_url).hexdigest()

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            # Fallback so the scraped XML is still written if the database
            # lookup fails; previously ``filename`` stayed unbound and the
            # write below raised NameError.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "5097450b-2c49-49d4-b47a-55b1bc652c78")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        # NOTE: sending the report mail and writing the log file are
        # disabled for this spider; the disabled code is kept for reference.
        # try:
        #     mail.send_mail(msg, "Boots: {0}".format(filename))
        #     if self.d['email']:
        #         mail.send_mail(msg, "Boots: {0}".format(filename), self.d['email'])
        # except:
        #     msg += "\nSending mail failed."
        # if self.d['database']:
        #     path = "logs/{0}".format(self.name)
        #     if not os.path.exists(path):
        #         os.makedirs(path)
        #     with open("{0}/{1}".format(path, filename), 'w') as f:
        #         f.write(msg)

    def get_lists_from_excel(self):
        """Load urls, product ids and names from the excel file.

        Read failures are reported through ``self.exc.code_handler`` with
        code 103. Entries without urls end up in ``self.no_urls``.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom properties this spider writes on ``xml``."""
        properties = [
            ("in_stock", "In Stock", "text"),
            ("store_id", "Store ID", "text"),
            ("lang_id", "Lang ID", "text"),
            ("catalog_id", "Catalog ID", "text"),
            ("order_id", "Order ID", "text"),
            ("cat_entry_id", "Cat Entry ID", "text"),
            ("calculation_usage_id", "Calculation Usage ID", "text"),
            ("ingredients", "Ingredients", "text"),
            ("patient_information_url", "Patient Information Url", "text"),
            ("points_price", "Points Price", "integer"),
            ("collect_points", "Collect Points", "integer"),
            ("brand_image_url", "Brand Image Url", "text"),
        ]
        # Six description fields are registered (description_1..6).
        properties.extend(
            ("description_%d" % number, "Description %d" % number, "text")
            for number in range(1, 7))
        properties.extend([
            ("sponsored", "Sponsored", "text"),
            ("offer", "Offer", "text"),
            ("promotion", "Promotion", "text"),
            ("old_price", "Old Price", "decimal"),
            ("save_money", "Save Money", "decimal"),
            ("price_per_size", "Price Per Size", "text"),
        ])
        for tag, label, kind in properties:
            xml.add_property(tag, label, kind)

Exemple #16

0

Afficher le fichier

Fichier : partylite_spider.py Projet : marjevtic/testMarko

class PartyliteSpider(CrawlSpider):
    """CrawlSpider subclass that writes scraped "partylite" products to XML.

    Product urls come from the database or from an excel sheet. Depending on
    the ``lang`` argument the US urls are requested as they are or rewritten
    to the Canadian English/French user before being requested.
    """
    name = "partylite"
    allowed_domains = ["partylite.biz"]
    start_urls = ["http://www.zmags.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Read the run arguments and load the list of products to scrape."""
        super(PartyliteSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = PartyliteTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.images_store = "/" + settings['IMAGES_STORE']
        self.users = party.get_users(settings, self.d)
        self.exc = ZmagsException(50)
        self.production = self.d['env']
        self.upload = self.d['upload']
        self.english = self.d['lang']
        self.file_name = self.d['file']
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
            self.change_url_list()
        else:
            self.get_lists_from_excel()
        self.xml = CommonXml()
        party.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Yield one request per product url for the selected language.

        Nothing is yielded when ``lang`` is none of ``us``, ``english`` or
        ``french``.
        """
        lang = self.d['lang']
        for url in self.products['urls']:
            if lang == 'us':
                yield Request(url, callback=self.parse_can, dont_filter=True)
            elif lang == 'english':
                c_url = url.replace(self.users['us'], self.users['canada_en'])
                request = Request(c_url, callback=self.parse_can, dont_filter=True)
                request.meta['language'] = "eng"
                yield request
            elif lang == 'french':
                c_url = url.replace(self.users['us'], self.users['canada_fr'])
                request = Request(c_url, callback=self.parse_can, dont_filter=True)
                request.meta['language'] = "fr"
                yield request

    def change_url_list(self):
        """Fill the user (and, off production, the qa host) into the urls."""
        urls = self.products['urls']
        for i, url in enumerate(urls):
            if not self.production:
                url = url.replace('www', 'qa')
            urls[i] = url.replace('XXXXX', self.users['us'])

    def get_in_stock(self, hxs):
        """Gets in stock information about product."""
        stock = hxs.select('//div[@id="availability_container"]').extract()
        # An availability container on the page means "not in stock".
        if stock:
            return ["NOT_IN_STOCK"]
        return ["IN_STOCK"]

    def get_basic_info(self, hxs):
        """Getting basic info about products (name, shown with)."""
        name = hxs.select('//div[@id="product_name"]/text()').extract()
        if name:
            name = basic.cdata_field(name)
        shown_with = hxs.select('//div[@id="shown_with_container"]').extract()
        if shown_with:
            shown_with = [basic.cdata(shown_with[0])]
        return name, shown_with

    def get_description(self, hxs):
        """Return the tag-free product description as a 1-item list.

        Raises IndexError when the page has no description block.
        """
        description = hxs.select('//div[@id="item_description"]').extract()
        text = basic.cdata(basic.remove_tags(description[0]))
        # Replace the fraction slash (U+2044) with a plain "/".
        return [text.replace(u"\u2044", "/")]

    def get_price(self, hxs):
        """Getting product prices.
        Gets regular and discount price if there is one."""
        price = hxs.select('//span[@id="divUnitPrice"]/text()').extract()
        if not price:
            price = hxs.select('//div[@id="product_price"]/span[1]/text()').extract()
        if not price:
            price = hxs.select('//div[@id="product_price"]/text()').extract()
        discount = hxs.select('//div[@id="product_price"]/span[@class="pc-salePrice"]/text()').extract()
        price = basic.clean_string(price[0])
        price = re.sub(" +", " ", price)
        price = price.replace("Price:", "")
        price = price.replace("Prix:", "")
        price = basic.cdata(price.strip())
        if discount:
            discount = basic.cdata_field(discount)
        return [price], discount

    def get_add_to_cart_id(self, page):
        """Gets add to cart id from the javascript on the page."""
        tmp = basic.get_middle_text(page, "if(isOrderStarted){", "}else")[0]
        return basic.get_middle_text(tmp, "addItemToCart(", ",")

    def create_subproducts(self, page):
        """Gets information about colors from javascript.
        Returns field of dicts with information about colors.
        Those are really color variants for product."""
        try:
            tmp = page.split("var largeImages = new Array();")[1]
        except IndexError:
            print("This product has no images")
            return []
        tmp = tmp.split("colorDropdownArray")[0]
        images = basic.get_middle_text(tmp, "ProductGroupProduct(", ");")
        image_names = self.get_image_names(page)
        color_products = []
        for im in images:
            product = {}
            attributes = im.split("',")
            product['normal_image_url'] = "http://qa.partylite.biz/imaging/resize?fileName=/productcatalog/production"
            product['normal_image_url'] += self.custom_clean_string(attributes[26], True)
            product['description'] = basic.cdata(self.custom_clean_string(attributes[27]))
            product['color_id'] = self.custom_clean_string(attributes[7], True)
            product['swatch_color'] = basic.cdata(self.custom_clean_string(attributes[9]).replace(" ", ""))
            product['name'] = basic.cdata(image_names[product['color_id']])
            product['add_to_cart_id'] = self.custom_clean_string(attributes[0], True).replace(" ", "")
            product['price'] = self.custom_clean_string(attributes[10], True)
            color_products.append(product)
        return color_products

    def custom_clean_string(self, string, spaces=False):
        """Custom function for cleaning strings.
        Removes new line, return and tab signs and single quotes. Runs of
        spaces are collapsed to one space, or removed entirely when
        ``spaces`` is true."""
        for unwanted in ("\r", "\n", "\t"):
            string = string.replace(unwanted, "")
        if spaces:
            string = re.sub(' ', '', string)
        else:
            string = re.sub(' +', ' ', string)
        return string.replace("'", "")

    def get_image_names(self, page):
        """Gets color names for color swatches."""
        names = {}
        # The text before the first "new DropDownInfo" holds no entry.
        for chunk in page.split("new DropDownInfo")[1:]:
            color_name = basic.get_middle_text(chunk, "'", "')")[2]
            color_id = basic.get_middle_text(chunk, "('", "'")[0]
            names[color_id] = color_name
        return names

    def get_recommended(self, hxs):
        """Gets recommended product information.
        Returns a list holding the first recommended product as a cdata
        wrapped json string; the list is empty when there is none."""
        rec = hxs.select('//div[@id="right_column_container"]/div')
        new = []
        for r in rec:
            # to do: see how to get full href(different accounts)
            d = {}
            d['link'] = r.select('div/a/@href').extract()[0]
            d['image'] = "http://www.partylite.biz/imaging/resize"
            d['image'] += r.select('div/a/img/@src').extract()[0]
            d['name'] = r.select('div/a/text()').extract()[0]
            new.append(basic.cdata(simplejson.dumps(d)))
            # Only the first recommended product is used.
            break
        return new

    def get_reviews(self, page):
        """Gets average product rating.
        Returns string like 4.6 of 5 reviews."""
        review_id = self.get_review_id(page)
        url = "http://partylite.ugc.bazaarvoice.com/8504-en_us/" + review_id + "/reviews.djs?format=embeddedhtml"
        url = url.replace(" ", "")
        connection = urllib2.urlopen(url)
        try:
            reviews = connection.read()
        finally:
            # Release the connection even when reading fails.
            connection.close()
        # The markers below include the backslashes found in the fetched text.
        reviews = basic.get_middle_text(reviews, '<div class=\\"BVRRRatingNormalImage\\">', '<\\/div>')
        if reviews:
            rating = basic.get_middle_text(reviews[0], 'alt=\\"', '\\"')[0]
            return [rating]
        return []

    def get_more_images(self, page):
        """Gets field of images."""
        try:
            script = basic.get_middle_text(page, "var moreImages", "var numberOfImages")[0]
        except IndexError:
            print("This product has no images.")
            return []
        if self.production:
            host = "http://www.partylite.biz"
        else:
            host = "http://qa.partylite.biz"
        # return cdata here if needed to go with absolute links
        return [host + entry.split("= '")[1]
                for entry in basic.get_middle_text(script, "moreImages[", "';")]

    def get_absolute(self, relatives):
        """Creates absolute path for images. [DEPRECATED]
        Please check if there is a need for this function again.
        If needed dimensions of images got from the client server
        can be changed here."""
        new = []
        print(relatives)
        # NOTE(review): this terminates the whole process, so the loop below
        # is never reached; kept as is because the function is deprecated.
        os._exit(0)
        for i in range(0, len(relatives)):
            #add width, height here for different dimensions
            #don't change the url in here from qa to www it's meant to be qa always
            new.append("http://www.partylite.biz/imaging/resize?fileName=/productcatalog/production" + relatives[i])
        return new

    def get_review_id(self, page):
        """Gets review id that is used in javascript for reviews."""
        return basic.get_middle_text(page, 'productId: "', '"')[0]

    def write_subproducts(self, id, list, xml):
        """Writes child products to xml.
        Receives id, list and xml attributes, id is master product id,
        list is list of child products and xml is Xml instance"""
        # ``id`` and ``list`` shadow builtins; the names are kept because
        # they are part of the method's signature.
        for position, subproduct in enumerate(list):
            item = PartyliteItem()
            item['master_product_id'] = id
            item['product_id'] = [id[0] + "_" + str(position)]
            item['in_stock'] = ["IN_STOCK"]
            for k, v in subproduct.items():
                item[k] = [v]
            xml.create_xml(item)
        return 1

    def parse_can(self, response):
        """Parse function for scraping canadian sites.
        There is meta information send in request in this function about language."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        item = PartyliteItem()
        hxs = HtmlXPathSelector(response)
        image_urls = []
        if 'redirect_urls' in response.request.meta:
            # A redirected request is written as a not-available product.
            redirect_urls = response.request.meta['redirect_urls']
            original_id = self.get_id(redirect_urls[0])[0]
            item['product_id'] = [original_id]
            self.exc.code_handler(102, redirect_urls)
            if 'language' in response.request.meta:
                item['product_id'] = [original_id + "_can" + "_"
                                      + response.meta['language']]
            try:
                index = self.products['product_ids'].index(original_id)
                item['name'] = [basic.cdata(item['product_id'][0]
                                + self.products['names'][index])]
                self.products['status'][index] = 'no_avail'
            except (KeyError, ValueError):
                # list.index() raises ValueError for an unknown id.
                print("This %s id is not in list" % (item['product_id'][0]))
            item['in_stock'] = ['NOT_AVAILABLE']
            item['product_id'] = self.remove_spaces(item['product_id'])
            self.xml.create_xml(item)
        else:
            index = self.products['product_ids'].index(self.get_id(response.url)[0])
            try:
                item['product_id'] = self.get_id(response.url)
                item['name'], item['shown_with'] = self.get_basic_info(hxs)
                item['description'] = self.get_description(hxs)
                if 'language' in response.meta:
                    item['product_id'] = [item['product_id'][0] + "_can" + "_" + response.meta['language']]
                response.meta['item'] = item
                page = " ".join(hxs.select('//html').extract())
                image_urls = self.get_more_images(page)
                item['normal_image_url'] = self.get_server_path_field(image_urls)
                item['in_stock'] = self.get_in_stock(hxs)
                color_products = self.create_subproducts(page)
                if color_products:
                    # Was the undefined name ``xml``, which raised NameError
                    # and pushed every product with colours into the error
                    # path below.
                    self.write_subproducts(item['product_id'], color_products, self.xml)
                else:
                    item['add_to_cart_id'] = self.get_add_to_cart_id(page)
                    item['custom_price'], item['custom_discount'] = self.get_price(hxs)
                self.products['status'][index] = "ran"
            except StandardError:
                basic.print_error()
                self.products['status'][index] = "error"
                self.exc.code_handler(100, response.url)
            else:
                item['product_id'] = self.remove_spaces(item['product_id'])
                self.xml.create_xml(item)
        if image_urls:
            item['image_urls'] = image_urls
        return item

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.
        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = party.get_settings_message(self.d)
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            # Fallback so the scraped XML is still written if the database
            # lookup fails; previously ``filename`` stayed unbound and the
            # code below raised NameError.
            filename = self.file_name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        logname = filename
        filename = "{0}_{1}".format(filename, self.d['lang'])
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        from modules.mail import Mail
        from modules.export_to_db import CommonExport
        exp = CommonExport()
        if self.upload:
            try:
                if self.d['lang'] == 'us':
                    exp.xml_to_db(self.name, filename, "55892247-1b92-4ff9-a8a3-33cc976f9341")
                else:
                    exp.xml_to_db(self.name, filename, "9cb6c676-c14f-403b-b94f-b981184e1de0")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        mail = Mail()
        try:
            mail.send_mail(msg, "Partylite: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Partylite: {0}".format(filename), self.d['email'])
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, logname), 'w') as f:
                f.write(msg)

    def get_id(self, url):
        """Gets id from product url.
        Raises IndexError when the url has no ``&sku=`` part."""
        return [url.split("&sku=")[1]]

    def get_server_path(self, url):
        """Gets server path for image url."""
        url = url.split("partylite.biz")[1]
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_server_path_field(self, urls):
        """Getting server path for field of image urls."""
        return [self.get_server_path(url) for url in urls]

    def remove_spaces(self, field):
        """Return a copy of ``field`` with spaces removed from every entry."""
        return [entry.replace(' ', '') for entry in field]

    def get_lists_from_excel(self):
        """Load urls, product ids and names from the excel file.

        Read failures are reported through ``self.exc.code_handler`` with
        code 103. Entries without urls end up in ``self.no_urls``.
        """
        excel_path = "xls/{0}/{1}.xls".format(self.name, self.d['file'])
        xls = PartyliteExcel(path=excel_path, user=self.users['us'], production=self.production)
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            # Was the undefined name ``exc``, which raised NameError here.
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

Exemple #17

0

Afficher le fichier

Fichier : chome_spider.py Projet : marjevtic/testMarko

class ChomeSpider(CrawlSpider):
    """Spider producing product XML for Celebrating Home from the client feed.

    The product ids to look up come either from the database or from an excel
    file (see ``__init__``).  ``parse`` reads the downloaded feed file, writes
    every matching product into ``self.xml`` and returns the image urls of
    those products.

    NOTE: written for Python 2 (``StandardError``, ``HTMLParser`` module).
    """

    name = "chome"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com/"]
    counter = 0
    # Count of requested ids matched in the feed.  The class-level default
    # keeps spider_closed() from failing when parse() was never reached.
    number = 0

    # XML namespaces of the feed's <properties> wrapper and of its fields.
    _META_NS = "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}"
    _DATA_NS = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"

    # Feed tags copied into the item as-is (even when empty) -> item field.
    _PLAIN_FIELDS = {
        _DATA_NS + "Id": "product_id",
        _DATA_NS + "PartNumber": "add_to_cart_id",
        _DATA_NS + "MaxQty": "max_qty",
        _DATA_NS + "TimeType": "time_type",
        _DATA_NS + "EngName": "name_english",
    }
    # Feed tags run through basic.cdata() and escape() -> item field.
    _DESCRIPTION_FIELDS = {
        _DATA_NS + "EngLongDesc": "description_english",
        _DATA_NS + "SpnLongDesc": "description_spanish",
    }
    # Feed tags that carry an image path relative to the shop's site.
    _IMAGE_TAGS = (
        _DATA_NS + "ImagePath_Large",
        _DATA_NS + "Alternate1ImagePath_Large",
        _DATA_NS + "Alternate2ImagePath_Large",
        _DATA_NS + "Alternate3ImagePath_Large",
    )

    def __init__(self, *a, **kw):
        """Read the run arguments and load the product lists.

        Products come from the database when the ``database`` argument is
        set, otherwise from the excel file named by the ``file`` argument.
        """
        super(ChomeSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.no_urls['product_ids'])

    def parse(self, response):
        """Process the whole feed and return one item holding all image urls.

        The response body itself is not used; the data comes from the feed
        file handled by ``parse_whole_xml``.
        """
        self.counter += 1
        item = ChomeItem()
        # Call form prints the same text under the Python 2 print statement.
        print("IDs in excel feed: {0}".format(self.total))
        item['image_urls'] = self.parse_whole_xml()
        return item

    def parse_whole_xml(self):
        """Scan the client feed and export every requested product.

        Downloads the feed first when the ``download`` argument is set;
        otherwise the previously downloaded file must exist (the process is
        terminated with exit status 2 if it does not).  Each feed entry whose
        ``Id`` is listed in ``self.no_urls['product_ids']`` is written to
        ``self.xml`` and its status is set to ``'ran'``; ids never seen in
        the feed end up with status ``'not_found'``.

        Returns the list of absolute image urls of all exported products.
        """
        xml_dir = "xml/{0}".format(self.name)
        feed_path = "{0}/client_feed.xml".format(xml_dir)
        file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
        downloader = Downloader()
        if self.d['download']:
            downloader.get_file(xml_dir, file_url, "client_feed")
        elif not os.path.exists(feed_path):
            basic.warning("Feed file doesn't exist please de-select no download option")
            os._exit(2)
        self.number = 0
        wanted_ids = self.no_urls['product_ids']
        statuses = self.no_urls['status']
        properties_tag = self._META_NS + "properties"
        id_tag = self._DATA_NS + "Id"
        urls_all = []
        for _event, elem in iterparse(feed_path):
            if elem.tag != properties_tag:
                continue
            for child in elem:
                if child.tag == id_tag and child.text in wanted_ids:
                    statuses[wanted_ids.index(child.text)] = 'ran'
                    self.number += 1
                    xml_item, urls = self._build_item(elem)
                    self.xml.create_xml(xml_item)
                    urls_all += urls
            # The entry is fully handled; drop its children to keep memory
            # flat while streaming through the feed.
            elem.clear()
        for position, status in enumerate(statuses):
            if status != 'ran':
                statuses[position] = 'not_found'
        return urls_all

    def _build_item(self, elem):
        """Build a fresh item from one feed ``properties`` element.

        A new item is created for every product so that optional fields
        missing from this entry are not inherited from the previous one.

        Returns ``(item, urls)`` where ``urls`` are the absolute image urls
        of the product in feed order.
        """
        ns = self._DATA_NS
        item = ChomeItem()
        urls = []
        server_paths = []
        for child in elem:
            tag, text = child.tag, child.text
            if tag in self._PLAIN_FIELDS:
                item[self._PLAIN_FIELDS[tag]] = [text]
            elif tag == ns + "IsActive":
                item['in_stock'] = [self._stock_status(text)]
            elif text is None:
                # Every remaining field is optional and skipped when empty.
                continue
            elif tag in self._DESCRIPTION_FIELDS:
                item[self._DESCRIPTION_FIELDS[tag]] = [self.escape(basic.cdata(text))]
            elif tag == ns + "SpnName":
                item['name_spanish'] = [text]
            elif tag == ns + "RetailPrice":
                # Drops the last two characters of the price text.
                # NOTE(review): presumably surplus decimal digits - confirm.
                item['custom_price'] = [text[:-2]]
            elif tag in self._IMAGE_TAGS:
                absolute = self.get_absolute(text)
                urls.append(absolute)
                server_paths.append(self.get_server_path(absolute))
        if server_paths:
            item['normal_image_url'] = server_paths
        return item, urls

    @staticmethod
    def _stock_status(text):
        """Translate the feed's ``IsActive`` text into a stock status string.

        Element text is always a string (or ``None``), so comparing it with
        the integer ``0`` can never match.
        NOTE(review): assumes inactive products are sent as "0" or "false" -
        confirm against the feed.
        """
        inactive = text is not None and text.strip().lower() in ("0", "false")
        return "NOT_IN_STOCK" if inactive else "IN_STOCK"

    def get_server_path(self, url):
        """Return the image store path for ``url``.

        The path is ``<images_store>/full/<sha1 hex digest of url>.jpg``.
        """
        # hashlib needs bytes: encode text explicitly so that unicode urls with
        # non-ASCII characters do not fail.  ASCII input hashes as before.
        if not isinstance(url, bytes):
            url = url.encode("utf-8")
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_absolute(self, url):
        """Prefix a feed image path with the shop's base url."""
        return "http://www.celebratinghome.com/" + url

    def escape(self, string):
        """Unescape HTML entities in ``string`` twice (for double-escaped text)."""
        parser = HTMLParser.HTMLParser()
        return parser.unescape(parser.unescape(string))

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, updating
        the database and sending the appropriate mail message.  When run from
        the database interface the message is also written to
        ``logs/<spider name>/<filename>``.
        """
        msg = "Ran: {0}\n".format(datetime.now())
        missing = self.total - self.number
        if missing:
            msg += "{0} id(s) from id list weren't found in feed".format(missing)
            basic.warning(msg)
        else:
            msg += "All ids found in feed."
            basic.green(msg)
        # filename for writing xml
        if self.d['database']:
            # Fallback name so the xml, mail and log below still work when
            # the database cannot be reached.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.no_urls)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        # Export of the xml to the database (CommonExport) is disabled.
        from modules.mail import Mail
        mail = Mail()
        subject = "CelebratingHome: {0}".format(filename)
        try:
            mail.send_mail(msg, subject)
            if self.d['email']:
                mail.send_mail(msg, subject, self.d['email'])
        except Exception:
            # Mailing is best effort; the failure is recorded in the log text.
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load product ids, names and urls from the spider's excel file.

        Fills ``self.products`` (entries with urls) and ``self.no_urls``
        (entries without), both with duplicates removed and a status column
        added by the excel helper.  Read failures are reported through
        ``self.exc.code_handler`` with code 103.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom text properties this spider exports on ``xml``."""
        properties = (
            ("description_english", "Description English"),
            ("description_spanish", "Description Spanish"),
            ("add_to_cart_id", "Add To Cart ID"),
            ("max_qty", "Max Quantity"),
            ("time_type", "Time Type"),
            ("name_english", "Name English"),
            ("name_spanish", "Name Spanish"),
            ("in_stock", "In Stock"),
            ("custom_price", "Custom Price"),
        )
        for key, label in properties:
            xml.add_property(key, label, "text")

Exemple #18

0

Afficher le fichier

class ChomeSpider(CrawlSpider):
    """Spider producing product XML for Celebrating Home from the client feed.

    The product ids to look up come either from the database or from an excel
    file (see ``__init__``).  ``parse`` reads the downloaded feed file, writes
    every matching product into ``self.xml`` and returns the image urls of
    those products.

    NOTE: written for Python 2 (``StandardError``, ``HTMLParser`` module).
    """

    name = "chome"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com/"]
    counter = 0
    # Count of requested ids matched in the feed.  The class-level default
    # keeps spider_closed() from failing when parse() was never reached.
    number = 0

    # XML namespaces of the feed's <properties> wrapper and of its fields.
    _META_NS = "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}"
    _DATA_NS = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"

    # Feed tags copied into the item as-is (even when empty) -> item field.
    _PLAIN_FIELDS = {
        _DATA_NS + "Id": "product_id",
        _DATA_NS + "PartNumber": "add_to_cart_id",
        _DATA_NS + "MaxQty": "max_qty",
        _DATA_NS + "TimeType": "time_type",
        _DATA_NS + "EngName": "name_english",
    }
    # Feed tags run through basic.cdata() and escape() -> item field.
    _DESCRIPTION_FIELDS = {
        _DATA_NS + "EngLongDesc": "description_english",
        _DATA_NS + "SpnLongDesc": "description_spanish",
    }
    # Feed tags that carry an image path relative to the shop's site.
    _IMAGE_TAGS = (
        _DATA_NS + "ImagePath_Large",
        _DATA_NS + "Alternate1ImagePath_Large",
        _DATA_NS + "Alternate2ImagePath_Large",
        _DATA_NS + "Alternate3ImagePath_Large",
    )

    def __init__(self, *a, **kw):
        """Read the run arguments and load the product lists.

        Products come from the database when the ``database`` argument is
        set, otherwise from the excel file named by the ``file`` argument.
        """
        super(ChomeSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.no_urls['product_ids'])

    def parse(self, response):
        """Process the whole feed and return one item holding all image urls.

        The response body itself is not used; the data comes from the feed
        file handled by ``parse_whole_xml``.
        """
        self.counter += 1
        item = ChomeItem()
        # Call form prints the same text under the Python 2 print statement.
        print("IDs in excel feed: {0}".format(self.total))
        item['image_urls'] = self.parse_whole_xml()
        return item

    def parse_whole_xml(self):
        """Scan the client feed and export every requested product.

        Downloads the feed first when the ``download`` argument is set;
        otherwise the previously downloaded file must exist (the process is
        terminated with exit status 2 if it does not).  Each feed entry whose
        ``Id`` is listed in ``self.no_urls['product_ids']`` is written to
        ``self.xml`` and its status is set to ``'ran'``; ids never seen in
        the feed end up with status ``'not_found'``.

        Returns the list of absolute image urls of all exported products.
        """
        xml_dir = "xml/{0}".format(self.name)
        feed_path = "{0}/client_feed.xml".format(xml_dir)
        file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
        downloader = Downloader()
        if self.d['download']:
            downloader.get_file(xml_dir, file_url, "client_feed")
        elif not os.path.exists(feed_path):
            basic.warning(
                "Feed file doesn't exist please de-select no download option"
            )
            os._exit(2)
        self.number = 0
        wanted_ids = self.no_urls['product_ids']
        statuses = self.no_urls['status']
        properties_tag = self._META_NS + "properties"
        id_tag = self._DATA_NS + "Id"
        urls_all = []
        for _event, elem in iterparse(feed_path):
            if elem.tag != properties_tag:
                continue
            for child in elem:
                if child.tag == id_tag and child.text in wanted_ids:
                    statuses[wanted_ids.index(child.text)] = 'ran'
                    self.number += 1
                    xml_item, urls = self._build_item(elem)
                    self.xml.create_xml(xml_item)
                    urls_all += urls
            # The entry is fully handled; drop its children to keep memory
            # flat while streaming through the feed.
            elem.clear()
        for position, status in enumerate(statuses):
            if status != 'ran':
                statuses[position] = 'not_found'
        return urls_all

    def _build_item(self, elem):
        """Build a fresh item from one feed ``properties`` element.

        A new item is created for every product so that optional fields
        missing from this entry are not inherited from the previous one.

        Returns ``(item, urls)`` where ``urls`` are the absolute image urls
        of the product in feed order.
        """
        ns = self._DATA_NS
        item = ChomeItem()
        urls = []
        server_paths = []
        for child in elem:
            tag, text = child.tag, child.text
            if tag in self._PLAIN_FIELDS:
                item[self._PLAIN_FIELDS[tag]] = [text]
            elif tag == ns + "IsActive":
                item['in_stock'] = [self._stock_status(text)]
            elif text is None:
                # Every remaining field is optional and skipped when empty.
                continue
            elif tag in self._DESCRIPTION_FIELDS:
                item[self._DESCRIPTION_FIELDS[tag]] = [
                    self.escape(basic.cdata(text))
                ]
            elif tag == ns + "SpnName":
                item['name_spanish'] = [text]
            elif tag == ns + "RetailPrice":
                # Drops the last two characters of the price text.
                # NOTE(review): presumably surplus decimal digits - confirm.
                item['custom_price'] = [text[:-2]]
            elif tag in self._IMAGE_TAGS:
                absolute = self.get_absolute(text)
                urls.append(absolute)
                server_paths.append(self.get_server_path(absolute))
        if server_paths:
            item['normal_image_url'] = server_paths
        return item, urls

    @staticmethod
    def _stock_status(text):
        """Translate the feed's ``IsActive`` text into a stock status string.

        Element text is always a string (or ``None``), so comparing it with
        the integer ``0`` can never match.
        NOTE(review): assumes inactive products are sent as "0" or "false" -
        confirm against the feed.
        """
        inactive = text is not None and text.strip().lower() in ("0", "false")
        return "NOT_IN_STOCK" if inactive else "IN_STOCK"

    def get_server_path(self, url):
        """Return the image store path for ``url``.

        The path is ``<images_store>/full/<sha1 hex digest of url>.jpg``.
        """
        # hashlib needs bytes: encode text explicitly so that unicode urls with
        # non-ASCII characters do not fail.  ASCII input hashes as before.
        if not isinstance(url, bytes):
            url = url.encode("utf-8")
        digest = hashlib.sha1(url).hexdigest()
        return self.images_store + "/full/" + digest + ".jpg"

    def get_absolute(self, url):
        """Prefix a feed image path with the shop's base url."""
        return "http://www.celebratinghome.com/" + url

    def escape(self, string):
        """Unescape HTML entities in ``string`` twice (for double-escaped text)."""
        parser = HTMLParser.HTMLParser()
        return parser.unescape(parser.unescape(string))

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, updating
        the database and sending the appropriate mail message.  When run from
        the database interface the message is also written to
        ``logs/<spider name>/<filename>``.
        """
        msg = "Ran: {0}\n".format(datetime.now())
        missing = self.total - self.number
        if missing:
            msg += "{0} id(s) from id list weren't found in feed".format(
                missing)
            basic.warning(msg)
        else:
            msg += "All ids found in feed."
            basic.green(msg)
        # filename for writing xml
        if self.d['database']:
            # Fallback name so the xml, mail and log below still work when
            # the database cannot be reached.
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.no_urls)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        # Export of the xml to the database (CommonExport) is disabled.
        from modules.mail import Mail
        mail = Mail()
        subject = "CelebratingHome: {0}".format(filename)
        try:
            mail.send_mail(msg, subject)
            if self.d['email']:
                mail.send_mail(msg, subject, self.d['email'])
        except Exception:
            # Mailing is best effort; the failure is recorded in the log text.
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load product ids, names and urls from the spider's excel file.

        Fills ``self.products`` (entries with urls) and ``self.no_urls``
        (entries without), both with duplicates removed and a status column
        added by the excel helper.  Read failures are reported through
        ``self.exc.code_handler`` with code 103.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom text properties this spider exports on ``xml``."""
        properties = (
            ("description_english", "Description English"),
            ("description_spanish", "Description Spanish"),
            ("add_to_cart_id", "Add To Cart ID"),
            ("max_qty", "Max Quantity"),
            ("time_type", "Time Type"),
            ("name_english", "Name English"),
            ("name_spanish", "Name Spanish"),
            ("in_stock", "In Stock"),
            ("custom_price", "Custom Price"),
        )
        for key, label in properties:
            xml.add_property(key, label, "text")