Exemple #1
0
    def parse(self, response):
        """Build a ``NuyolkItem`` from a product detail page and yield it.

        Most fields come from fixed ``<meta>`` positions inside ``<head>`` and
        from the brand / product-name headings. Every required lookup takes
        the first XPath match, so a page that lacks one of those nodes raises
        ``IndexError``.

        Prices are stored as the page text with its first character removed
        (presumably the currency symbol -- TODO confirm against the site).
        """
        sel = response.selector

        def first(query):
            # First match of ``query``; IndexError when nothing matches.
            return sel.xpath(query).extract()[0]

        # prod_id is the time stamp (1/100 s resolution) followed by six
        # pseudo-random digits; the generator is seeded from the same stamp.
        stamp = int(time.time() * 100)
        random.seed(1412112 + stamp)

        item = NuyolkItem()

        item['prod_id'] = int(str(stamp) + str(int(random.uniform(100000, 999999))))
        item['product_link'] = first('/html/head/meta[12]/@content')

        item['merchant_prod_id'] = first('/html/head/meta[17]/@content')
        item['merchant_id'] = "70856L"

        brand = first('//h1[@class="brand"]/a/text()')
        short_desc = first('//h1[@class="product-name"]/text()')
        long_desc = first('/html/head/meta[4]/@content')
        item['brand'] = brand
        item['short_desc'] = short_desc
        item['long_desc'] = long_desc
        item['primary_color'] = ""  # later

        item['currency'] = first('/html/head/meta[19]/@content')

        # A "price-standard" node is treated as the marker for an item on sale.
        standard = sel.xpath("//span[@class='price-standard']/text()").extract()
        current = first("//span[@class='price-sales']/text()")[1:]
        if standard:
            item['price_orig'] = standard[0][1:]
            item['price_sale'] = current
            item['price_perc_discount'] = int(
                (1 - float(current) / float(item['price_orig'])) * 100)
            item['price'] = current
        else:
            item['price_orig'] = current
            item['price'] = current

        item['image_urls'] = sel.xpath('//*[@class="zoom masterTooltip"]/img/@src').extract()  # new
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""

        # new: breadcrumb categories, also exposed one per field as mcat_1..N
        mcats = sel.xpath('//*[@id="main"]/div/div/ol/li/a/text()').extract()
        item['mcats'] = mcats
        for position, category in enumerate(mcats, 1):
            item['mcat_' + str(position)] = category

        item['cat_code'] = ""
        item['cat_1'] = ""  # deprecate
        item['cat_2'] = ""  # deprecate
        item['cat_3'] = ""  # deprecate

        # Reuse the values extracted above instead of querying the page again.
        # They are deliberately not passed through str(): on Python 2 that
        # raises UnicodeEncodeError for any non-ASCII brand or description.
        item['tags'] = " ".join([brand, short_desc, " ".join(mcats), long_desc])

        item['date_added'] = [unicode(str(time.strftime("%d/%m/%Y %H:%M:%S")), "utf-8")]

        yield item
Exemple #2
0
 def parse(self, response):
     """Yield a ``NuyolkItem`` holding brand, categories, product id and link.

     Every field is taken from the first match of its XPath query. If any
     lookup or conversion fails, the page is dropped and nothing is yielded.
     """
     item = NuyolkItem()  #Don't change!
     sel = response.selector
     try:
         brand_text = sel.xpath(
             '//*[@class="productDesc"]//p[@itemprop="brand"]/a/text()'
         ).extract()[0]
         item['brand'] = str(brand_text).strip()

         # The first breadcrumb entry is left out of the category list.
         breadcrumb = sel.xpath(
             '//nav[@id="breadcrumb"]//li/a/text()').extract()
         item['mcats'] = breadcrumb[1:]

         product_id = sel.xpath(
             '//form/input[@name="productId"]/@value').extract()[0]
         item['merchant_prod_id'] = str(product_id)

         canonical = sel.xpath('//*[@id="canonicalUrl"]/@href').extract()[0]
         item['product_link'] = str(canonical)
     except Exception:
         return
     yield item
Exemple #3
0
    def parse(self, response):
        """Scrape a Lamps Plus product page into one ``NuyolkItem``.

        Nothing is yielded when the SKU or the product name cannot be found.
        A missing brand falls back to ``""``. The remaining first-match
        lookups (key sentence, description, currency, regular price) raise
        ``IndexError`` when their node is absent.

        The low/high price pair is tried first and marks the item as on sale;
        when that pair is missing or unusable the single ``price`` value is
        used and the discount is 0.
        """
        sel = response.selector

        def first(query):
            # First match of ``query``; IndexError when nothing matches.
            return sel.xpath(query).extract()[0]

        def whole_price(query):
            # Commas are removed and the fractional part is truncated.
            return int(float(first(query).replace(",", "")))

        stamp = int(time.time() * 100)
        random.seed(1412112 + stamp)

        item = NuyolkItem()
        item['is_available'] = True
        item['affiliate_partner'] = "viglink"

        item['prod_id'] = str(stamp) + str(int(random.uniform(100000, 999999)))
        item['product_link'] = response.url

        item['merchant'] = "Lamps Plus"
        try:
            item['merchant_prod_id'] = first('//*[@id="pdProdSku"]/text()').replace('- Style # ', '')
        except IndexError:
            # No SKU node: skip the page.
            return
        item['merchant_id'] = "P2B2J5"

        try:
            item['brand'] = first('//*[@id="pnlBrand"]/@content')
        except IndexError:
            item['brand'] = ""

        try:
            item['short_desc'] = first('//*[@id="h1ProductName"]/text()').strip()
        except IndexError:
            # No product name: skip the page.
            return

        ld = [
            first('//*[@id="pdKeySentence"]/text()').strip(),
            first('//p[@itemprop="description"]/text()').strip(),
        ]
        ld.extend(sel.xpath('//*[@id="pdDescBullets"]/li/text()').extract())
        # Drop every description line containing one of these substrings.
        # A plain comprehension is used because boolean-mask indexing through
        # numpy raises IndexError as soon as the list becomes empty.
        skipwords = ("clean", "instructions", "cm", "\" ", "wash", "in.",
                     "inch", "size", "mm ", "weighs", "lbs.")
        ld = [line for line in ld if not any(w in line for w in skipwords)]
        item['long_desc'] = " | ".join(ld).strip()
        item['primary_color'] = ""  # later

        item['currency'] = first('//meta[@itemprop="priceCurrency"]/@content')
        if item['currency'] == 'USD':
            item['currency_symbol'] = '$'
        else:
            item['currency_symbol'] = '?'  ##TODO

        # If item is on sale,
        try:
            sale = whole_price("//*[@itemprop='lowPrice']/@content")
            regular = whole_price("//*[@itemprop='highPrice']/@content")
            discount = int((1 - float(sale) / float(regular)) * 100)
            on_sale = True
        except (IndexError, ValueError, ZeroDivisionError):
            regular = whole_price("//*[@itemprop='price']/@content")
            sale = regular
            discount = 0
            on_sale = False
        item['price_orig'] = regular
        item['price_sale'] = sale
        item['price'] = sale
        item['price_perc_discount'] = discount
        item['on_sale'] = on_sale

        imgs = sel.xpath('//*[@id="pdAddlImgs"]//img/@src').extract()
        # Remove whatever sits between 'fpx?' and 'fmt=jpeg' in each URL.
        image_urls = [x.replace(find_between(x, 'fpx?', 'fmt=jpeg'), "") for x in imgs]
        item['image_urls'] = image_urls
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""

        # imglink_1..6: the first six image URLs, padded with "".
        for i in range(6):
            item['imglink_' + str(i + 1)] = image_urls[i] if i < len(image_urls) else ""

        crumbs = [x.strip() for x in sel.xpath('//*[@id="divBreadCrumb"]//text()').extract()]
        # List comprehension rather than filter(): filter() returns an
        # iterator on Python 3, which cannot be sliced.
        mcats = [x for x in crumbs if x != "" and x != "|"]
        mcats = mcats[1:-2]  # first entry and last two entries are dropped

        item['mcat_code'] = ""
        # image_urls is cleared once the links have been copied to imglink_N.
        item['image_urls'] = ""

        # mcat_1..5: breadcrumb entries except the last one, padded with "".
        for i in range(5):
            item['mcat_' + str(i + 1)] = mcats[i] if i < len(mcats) - 1 else ""

        item['cat_code'] = ""
        item['cat_1'] = ""  # deprecate
        item['cat_2'] = ""  # deprecate
        item['cat_3'] = ""  # deprecate

        tag_parts = [item['brand'], item['short_desc'], item['mcat_1']]
        tag_parts.extend(mcats[1:])
        tag_parts.append(item['long_desc'])
        item['tags'] = " ".join(tag_parts)

        # One time stamp for both fields so they can never differ.
        now = time.strftime("%d/%m/%Y %H:%M:%S")
        item['date_added'] = now
        item['date_last_updated'] = now

        yield item
Exemple #4
0
    def parse(self, response):
        """Scrape a Society6 product page into one ``NuyolkItem``.

        The merchant product id is best effort: when the ``dataLayer`` script
        has no ``"id"`` entry the field is simply left unset. A missing brand
        becomes ``""`` and a missing "about the art" text is skipped. The
        title, product description, currency and regular price are required;
        their absence raises ``IndexError``.
        """
        sel = response.selector

        def first(query):
            # First match of ``query``; IndexError when nothing matches.
            return sel.xpath(query).extract()[0]

        def whole_price(query):
            # Commas are removed and the fractional part is truncated.
            return int(float(first(query).replace(",", "")))

        stamp = int(time.time() * 100)
        random.seed(1412112 + stamp)

        item = NuyolkItem()
        item['is_available'] = True
        item['affiliate_partner'] = "viglink"

        item['prod_id'] = str(stamp) + str(int(random.uniform(100000, 999999)))
        item['product_link'] = response.url

        item['merchant'] = "Society6"
        try:
            # Raw string: same pattern text as before, without relying on
            # Python passing the invalid "\:" escape through unchanged.
            mpi = response.xpath(
                '//script[contains(., "dataLayer = ")]/text()'
            ).re(r'"id"\:"(.*)')[0]
            item['merchant_prod_id'] = mpi.split("\"")[0]
        except IndexError:
            # Best effort: leave merchant_prod_id unset when no id is found.
            pass
        item['merchant_id'] = "7599C0"

        try:
            avatar_alt = first('//*[@class="user-avatar"]/a/img/@alt')
            item['brand'] = avatar_alt.split(" (")[0].strip()
        except IndexError:
            item['brand'] = ""

        # Page title up to the first " by ".
        item['short_desc'] = first('//title/text()').split(" by ")[0].capitalize()

        try:
            ld = [first('//*[@id="about-the-art-description"]/text()').strip()]
        except IndexError:
            ld = []
        # Split the product description into sentences, giving back the
        # period that split(". ") removed from all but the last one.
        sentences = first('//*[@id="product-description"]//text()').strip().split(". ")
        ld.extend([s + "." for s in sentences[:-1]])
        ld.append(sentences[-1])

        # Drop every description line containing one of these substrings.
        # A plain comprehension is used because boolean-mask indexing through
        # numpy raises IndexError as soon as the list becomes empty.
        skipwords = ("clean", "instructions", "cm", "\" ", "wash", "in.",
                     "inch", "size", "mm ", "weighs", "lbs.")
        ld = [line for line in ld if not any(w in line for w in skipwords)]
        item['long_desc'] = " | ".join(ld).strip()
        item['primary_color'] = ""  # later

        item['currency'] = first('//meta[@property="og:price:currency"]/@content')
        if item['currency'] == 'USD':
            item['currency_symbol'] = '$'
        else:
            item['currency_symbol'] = '?'  ##TODO

        # If item is on sale,
        try:
            #####TODO (cannot find products on sale)
            sale = whole_price('//meta[@property="og:price:sale"]/@content')
            regular = whole_price('//meta[@property="og:price:orig"]/@content')
            discount = int((1 - float(sale) / float(regular)) * 100)
            on_sale = True
        except (IndexError, ValueError, ZeroDivisionError):
            regular = whole_price('//meta[@property="og:price:amount"]/@content')
            sale = regular
            discount = 0
            on_sale = False
        item['price_orig'] = regular
        item['price_sale'] = sale
        item['price'] = sale
        item['price_perc_discount'] = discount
        item['on_sale'] = on_sale

        image_urls = sel.xpath('//*[@id="product-image-main"]//img/@src').extract()
        item['image_urls'] = image_urls
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""

        # imglink_1..6: the first six image URLs, padded with "".
        for i in range(6):
            item['imglink_' + str(i + 1)] = image_urls[i] if i < len(image_urls) else ""

        crumbs = sel.xpath('//*[@class="breadcrumb_v2"]//span/text()').extract()
        # List comprehension rather than filter(): filter() returns an
        # iterator on Python 3, which cannot be sliced.
        mcats = [x for x in crumbs if x != "/"]
        mcats = mcats[1:-1]  # first and last entries are dropped

        item['mcat_code'] = ""
        # image_urls is cleared once the links have been copied to imglink_N.
        item['image_urls'] = ""

        # mcat_1..5: breadcrumb entries except the last one, padded with "".
        for i in range(5):
            item['mcat_' + str(i + 1)] = mcats[i] if i < len(mcats) - 1 else ""

        item['cat_code'] = ""
        item['cat_1'] = ""  # deprecate
        item['cat_2'] = ""  # deprecate
        item['cat_3'] = ""  # deprecate

        tag_parts = [item['brand'], item['short_desc'], item['mcat_1']]
        tag_parts.extend(mcats[1:])
        tag_parts.append(item['long_desc'])
        item['tags'] = " ".join(tag_parts)

        # One time stamp for both fields so they can never differ.
        now = time.strftime("%d/%m/%Y %H:%M:%S")
        item['date_added'] = now
        item['date_last_updated'] = now

        yield item
Exemple #5
0
    def parse(self, response):
        """Scrape an HBX product page into one ``NuyolkItem``.

        Pages that show a "sold-out-header" element are reported on stdout
        and skipped. For every other page the item is built from first-match
        XPath lookups; a missing required node raises ``IndexError``.

        The sale/regular price pair is tried first and marks the item as on
        sale; otherwise the regular price alone is used with a 0 discount.
        """
        sel = response.selector

        # Guard clause instead of building the whole item inside an
        # ``except`` handler: an explicit test reads better and keeps later
        # failures from being reported as chained to a lookup error.
        if sel.xpath('//*[@class="sold-out-header"]/text()').extract():
            print("SOLD OUT--SKIPPED!")
            return

        def first(query):
            # First match of ``query``; IndexError when nothing matches.
            return sel.xpath(query).extract()[0]

        def whole_price(query):
            # The first four characters are dropped (presumably a currency
            # prefix -- TODO confirm), commas removed, fraction truncated.
            return int(float(first(query)[4:].replace(",", "")))

        stamp = int(time.time() * 100)
        random.seed(1412112 + stamp)

        item = NuyolkItem()
        item['is_available'] = True
        item['affiliate_partner'] = "viglink"

        item['prod_id'] = str(stamp) + str(int(random.uniform(100000, 999999)))
        item['product_link'] = first('/html/head/meta[23]/@content')

        item['merchant'] = "HBX"
        item['merchant_prod_id'] = first('//*[@id="product-summary"]/@data-id')  # skipped
        item['merchant_id'] = "70856L"

        item['brand'] = first('//h1[@class="brand"]/text()')
        # NOTE(review): short_desc uses the same XPath as brand, which looks
        # like a copy/paste slip -- confirm the intended node.
        item['short_desc'] = first('//h1[@class="brand"]/text()')
        ld = sel.xpath('.//*[@class="description"]/p/text()').extract()
        item['long_desc'] = " | ".join(ld).strip()
        item['primary_color'] = ""  # later

        item['currency'] = first('//*[@class="currency-dropdown"]/span/text()')
        if item['currency'] == 'USD':
            item['currency_symbol'] = '$'
        else:
            item['currency_symbol'] = '?'

        # If item is on sale,
        try:
            sale = whole_price("//span[@class='sale-price']/text()")
            regular = whole_price("//span[@class='regular-price']/text()")
            discount = int((1 - float(sale) / float(regular)) * 100)
            on_sale = True
        except (IndexError, ValueError, ZeroDivisionError):
            regular = whole_price("//span[@class='regular-price']/text()")
            sale = regular
            discount = 0
            on_sale = False
        item['price_orig'] = regular
        item['price_sale'] = sale
        item['price'] = sale
        item['price_perc_discount'] = discount
        item['on_sale'] = on_sale

        image_urls = sel.xpath('.//ul[@class="slides"]/li/img/@src').extract()
        item['image_urls'] = image_urls
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""

        # imglink_1..6: the first six image URLs, padded with "".
        for i in range(6):
            item['imglink_' + str(i + 1)] = image_urls[i] if i < len(image_urls) else ""

        mcats = sel.xpath(
            './/ol[contains(@class, "breadcrumb") and contains(@class, "hidden-xs")]/li/a/text()'
        ).extract()
        mcats = [mc.strip() for mc in mcats]
        item['mcat_code'] = ""
        # image_urls is cleared once the links have been copied to imglink_N.
        item['image_urls'] = ""

        # mcat_1..5: the last breadcrumb entry is always blanked; slot 1 is
        # replaced by a gender derived from the URL; the rest are copied and
        # padded with "".
        for i in range(5):
            attr = 'mcat_' + str(i + 1)
            if i == len(mcats) - 1:
                item[attr] = ""
            elif i == 0:
                item[attr] = 'Women' if 'women' in response.url else 'Men'
            elif i < len(mcats):
                item[attr] = mcats[i]
            else:
                item[attr] = ""

        item['cat_code'] = ""
        item['cat_1'] = ""  # deprecate
        item['cat_2'] = ""  # deprecate
        item['cat_3'] = ""  # deprecate

        tag_parts = [item['brand'], item['short_desc'], item['mcat_1']]
        tag_parts.extend(mcats[1:])
        tag_parts.append(item['long_desc'])
        item['tags'] = " ".join(tag_parts)

        # One time stamp for both fields so they can never differ.
        now = time.strftime("%d/%m/%Y %H:%M:%S")
        item['date_added'] = now
        item['date_last_updated'] = now

        yield item
Exemple #6
0
    def parse(self, response):
        """Scrape a Belk product page into one ``NuyolkItem``.

        The merchant product id is the last URL path segment without
        ``.html``. A missing brand becomes ``""``. The short description,
        currency and the ``product_category`` entry of the ``utag_data``
        script are required; their absence raises ``IndexError``.

        Prices: the sale/standard pair is tried first; otherwise two
        alternative "standardprice" nodes are tried in turn. When none of
        them yields a price the page is reported on stdout and skipped.
        """
        sel = response.selector

        def first(query):
            # First match of ``query``; IndexError when nothing matches.
            return sel.xpath(query).extract()[0]

        def to_whole(text):
            # Commas are removed and the fractional part is truncated.
            return int(float(text.replace(",", "")))

        stamp = int(time.time() * 100)
        random.seed(1412112 + stamp)

        item = NuyolkItem()
        item['is_available'] = True
        item['affiliate_partner'] = "viglink"

        item['prod_id'] = str(stamp) + str(int(random.uniform(100000, 999999)))
        item['product_link'] = response.url

        item['merchant'] = "Belk"
        item['merchant_prod_id'] = response.url.split("/")[-1].replace(".html", "")
        #item['upc'] ##TODO
        item['merchant_id'] = "IXR49N"

        try:
            item['brand'] = first('//*[@itemprop="brand"]/text()')
        except IndexError:
            item['brand'] = ""
        item['short_desc'] = first('//*[@class="brand-name"]/text()').strip()

        ld = sel.xpath('//meta[@name="description"]/@content').extract()
        ld.extend(sel.xpath('//ul[@class="copyline"]/li/text()').extract())
        # Drop every description line containing one of these substrings.
        # A plain comprehension is used because boolean-mask indexing through
        # numpy raises IndexError on an empty list, which happens here
        # whenever neither query above matches anything.
        skipwords = ("clean", "instructions", "cm", "wash", "in.", "inch",
                     "size", "mm ")
        ld = [line for line in ld if not any(w in line for w in skipwords)]
        item['long_desc'] = " | ".join(ld).strip()
        item['primary_color'] = ""  # later

        item['currency'] = first('//meta[@itemprop="priceCurrency"]/@content')
        if item['currency'] == 'USD':
            item['currency_symbol'] = '$'
        else:
            item['currency_symbol'] = '?'  ##TODO

        # If item is on sale,
        try:
            sale = to_whole(first("//*[@class='price-sales']/span/text()"))
            regular = to_whole(
                first("//*[@class='price-standard']/text()").replace("Orig. $", ""))
            discount = int((1 - float(sale) / float(regular)) * 100)
            on_sale = True
        except (IndexError, ValueError, ZeroDivisionError):
            # Not on sale: the regular price lives in one of two places.
            regular = None
            for query in ("//*[@class='standardprice']/input/@value",
                          "//*[@class='standardprice']/span/text()"):
                try:
                    regular = to_whole(first(query))
                    break
                except (IndexError, ValueError):
                    continue
            if regular is None:
                print("??? SKIPPED!")
                return
            sale = regular
            discount = 0
            on_sale = False
        item['price_orig'] = regular
        item['price_sale'] = sale
        item['price'] = sale
        item['price_perc_discount'] = discount
        item['on_sale'] = on_sale

        image_urls = sel.xpath('//div[@class="product-thumbnails"]//li/a/@href').extract()
        item['image_urls'] = image_urls
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""

        # imglink_1..6: the first six image URLs, padded with "".
        for i in range(6):
            item['imglink_' + str(i + 1)] = image_urls[i] if i < len(image_urls) else ""

        # Raw string: same pattern text as before, without relying on Python
        # passing the invalid "\:", "\[" and "\]" escapes through unchanged.
        category_path = response.xpath(
            '//script[contains(., "var utag_data")]/text()'
        ).re(r'product_category"\: \[([^]]+)\]')[0].strip().replace('"', "")
        mcats = category_path.split(" > ")

        item['mcat_code'] = ""
        # image_urls is cleared once the links have been copied to imglink_N.
        item['image_urls'] = ""

        # mcat_1..5: category entries except the last one, padded with "".
        for i in range(5):
            item['mcat_' + str(i + 1)] = mcats[i] if i < len(mcats) - 1 else ""

        item['cat_code'] = ""
        item['cat_1'] = ""  # deprecate
        item['cat_2'] = ""  # deprecate
        item['cat_3'] = ""  # deprecate

        tag_parts = [item['brand'], item['short_desc'], item['mcat_1']]
        tag_parts.extend(mcats[1:])
        tag_parts.append(item['long_desc'])
        item['tags'] = " ".join(tag_parts)

        # One time stamp for both fields so they can never differ.
        now = time.strftime("%d/%m/%Y %H:%M:%S")
        item['date_added'] = now
        item['date_last_updated'] = now

        yield item
Exemple #7
0
    def parse(self, response):
        """Scrape a Shoptiques product page into one ``NuyolkItem``.

        Brand, currency, canonical link and product name are required; a
        missing node raises ``IndexError``. A missing description becomes
        ``""``.

        Prices: the retail/sale pair inside ``#product-detail`` is tried
        first. When it is missing or unusable, the ``#product-price`` span is
        read instead (two markup variants are tried). Every path sets the
        same five price fields.
        """
        sel = response.selector

        def dollars(query):
            # First match without its leading character (presumably the
            # currency symbol -- TODO confirm), truncated to a whole number.
            return int(float(sel.xpath(query).extract()[0][1:]))

        datetime = int(str(int(time.time() * 100)))  #Don't change!
        random.seed(1412112 + datetime)  #Don't change!
        item = NuyolkItem()  #Don't change!
        item['brand'] = sel.xpath(
            '//span[@itemprop="brand"]/a/text()').extract()[0]
        item['cat_code'] = ""
        item['cat_1'] = ""  #deprecate
        item['cat_2'] = ""  #deprecate
        item['cat_3'] = ""  #deprecate
        item['currency'] = str(
            sel.xpath(
                '//div[@class="currency"]/span[@class="code"]/text()'
            ).extract()[0])
        if item['currency'] == 'USD':
            item['currency_symbol'] = '$'
        else:
            item['currency_symbol'] = '?'

        # One time stamp for both fields so they can never differ.
        now = time.strftime("%d/%m/%Y %H:%M:%S")
        item['date_added'] = now
        item['date_last_updated'] = now
        item['image_urls'] = ""
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""
        try:
            item['long_desc'] = sel.xpath(
                '//div[@itemprop="description"]//text()').extract()[0]
        except IndexError:
            item['long_desc'] = ""
        mcats = sel.xpath(
            './/ul[@class="shoptiques-breadcrumb"]/li/a/text()').extract()

        # mcat_1..5: the first five breadcrumb entries, padded with "".
        for i in range(5):
            item['mcat_' + str(i + 1)] = mcats[i] if i < len(mcats) else ""

        item['mcat_code'] = ""  #later #Do NLP predictions
        item['merchant_id'] = "3O056R"
        item['merchant_prod_id'] = ''

        try:
            orig = dollars(
                '//*[@id="product-detail"]//*[contains(@class, "retail")]/text()')
            sale = dollars(
                '//*[@id="product-detail"]//*[contains(@class, "sale")]/text()')
            on_sale = orig != sale
            # float() forces true division: with two ints Python 2 floors
            # sale / orig to 0, which reported every discount as 100.
            discount = int(100 - 100 * (float(sale) / orig)) if on_sale else 0
        except (IndexError, ValueError, ZeroDivisionError):
            try:
                orig = dollars(
                    '//div[@class="product-name"]/span[@id="product-price"]/span/text()')
            except (IndexError, ValueError):
                orig = dollars(
                    '//div[@class="product-name"]/span[@id="product-price"]/span/span[1]/text()')
            sale = orig
            on_sale = False
            discount = 0
        # price_sale and price_perc_discount are now also set when the retail
        # and sale prices are equal, matching the fallback path above.
        item['price_orig'] = orig
        item['price_sale'] = sale
        item['price'] = sale
        item['price_perc_discount'] = discount
        item['on_sale'] = on_sale

        item['primary_color'] = ""  #later
        item['prod_id'] = str(datetime) + str(
            int(random.uniform(100000, 999999)))  #Don't change!
        item['product_link'] = str(
            sel.xpath('//head/link[@rel="canonical"]/@href').extract()[0])
        item['short_desc'] = str(
            sel.xpath(
                '//div[@id="product-detail"]/div[@class="product-name"]/h1/text()'
            ).extract()[0].strip()).strip().replace("  ", "")

        tag_parts = [item['brand'], item['short_desc'], "Women"]
        tag_parts.extend(mcats)
        tag_parts.append(item['long_desc'])
        item['tags'] = " ".join(tag_parts)

        imglinks = sel.xpath('//ul[@id="image-carousel"]/li/a/@href').extract()
        # imglink_1..6: the first six carousel links, padded with "".
        for i in range(6):
            item['imglink_' + str(i + 1)] = str(imglinks[i]) if i < len(imglinks) else ""
        item['imglinks'] = ""
        item['is_available'] = True  #Don't change! #Fix later!
        item['affiliate_partner'] = "viglink"
        item['merchant'] = "Shoptiques"
        yield item
Exemple #8
0
    def parse(self, response):
        """Scrape a product page into one ``NuyolkItem``; skip it on any error.

        The whole extraction is best effort: any exception raised while
        reading the page (missing node, unparsable price, ...) makes the
        callback return without yielding.

        Prices: when both the ``<p>`` and ``<h3>`` price spans exist and
        differ, the ``<p>`` value is the original price and the ``<h3>``
        value the sale price. Otherwise the ``<h3>`` value is the only price
        and no sale fields are set.
        """
        datetime = int(str(int(time.time() * 100)))
        random.seed(1412112 + datetime)  #Don't change!
        item = NuyolkItem()  #Don't change!
        try:
            sel = response.selector

            # Read first: if the product form input is missing, the lookup
            # fails here and the page is skipped before any other work.
            product_id = str(
                sel.xpath(
                    '//div[@id="product-old"]/form/input[@name="product"]/@value'
                ).extract()[0])

            item['brand'] = ""  #Needs post-processing!

            item['cat_code'] = ""
            item['cat_1'] = ""  #deprecate
            item['cat_2'] = ""  #deprecate
            item['cat_3'] = ""  #deprecate

            # Currency is the text inside the <span> of the last entry of
            # the currency widget.
            entry_count = len(
                sel.xpath('//*[@id="currency-widget"]/li').extract())
            curr = ('//*[@id="currency-widget"]/li[' + str(entry_count) +
                    ']/a/span')
            curr_temp = sel.xpath(curr).extract()[0]
            item['currency'] = str(
                curr_temp[curr_temp.index(">") + 1:curr_temp.rindex("<")])
            item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))

            item['img_1'] = ""
            item['img_2'] = ""
            item['img_3'] = ""
            item['img_4'] = ""
            item['img_5'] = ""

            long_desc = sel.xpath('//p[@class="description"]/text()').extract()
            item['long_desc'] = " ".join(long_desc)

            item['mcats'] = ""  #later #Do NLP predictions
            # Previously 'mcat_1' was assigned six times in a row, leaving
            # mcat_2..mcat_5 unset; blank all five category slots instead.
            for slot in range(1, 6):
                item['mcat_' + str(slot)] = ""
            item['mcat_code'] = ""

            item['merchant_id'] = "JOQ3F3"
            item['merchant_prod_id'] = product_id

            # Each price span is queried once and the text reused.
            was = sel.xpath(
                "//div[@class='price-mobile']/p/span[@class='price']/text()"
            ).extract()
            now = sel.xpath(
                "//div[@class='price-mobile']/h3/span[@class='price']/text()"
            ).extract()
            if was and now and was[0] != now[0]:
                # [1:] drops the leading character of the price text
                # (presumably the currency symbol -- TODO confirm).
                was_text = was[0][1:]
                now_text = now[0][1:]
                item['price_orig'] = int(float(was_text))
                item['price_sale'] = int(float(now_text))
                item['price_perc_discount'] = int(
                    100 - (float(now_text) / float(was_text)) * 100)
                item['price'] = item['price_sale']
            else:
                # No separate original price (or both equal): the <h3> price
                # is the only one. A missing <h3> price raises IndexError
                # here and the page is skipped by the handler below.
                item['price_orig'] = int(float(now[0][1:]))
                item['price'] = item['price_orig']

            item['primary_color'] = ""  #later
            item['prod_id'] = int(
                str(datetime) +
                str(int(random.uniform(100000, 999999))))  #Don't change!
            item['product_link'] = str(
                sel.xpath('//link[@rel="canonical"]/@href').extract()[0])

            item['short_desc'] = str(sel.xpath('//title/text()').extract()[0])
            tags = [
                str(item['brand']),
                str(item['short_desc']), item['long_desc']
            ]  #str(" ".join(item['mcats'])),
            item['tags'] = " ".join(tags)

            imglinks = sel.xpath('//ul[@class="thumbs"]/li/a/img/@src').extract()
            item['imglinks'] = imglinks
            # imglink_1..6: the first six thumbnail URLs, padded with "".
            for i in range(6):
                item['imglink_' + str(i + 1)] = str(imglinks[i]) if i < len(imglinks) else ""

            item['is_available'] = True  #Don't change! #Fix later!
            item['affiliate_partner'] = "viglink"
            yield item
        except Exception:
            # Deliberate best effort: an unusable page is silently skipped.
            return
Exemple #9
0
    def parse(self, response):
        """Build one NuyolkItem from an AHAlife product page and yield it.

        The description lines and the tag string are assembled with plain
        list operations, so a page whose description lines are all filtered
        out still produces an item (with an empty ``long_desc``).

        Raises:
            IndexError: if a mandatory node (brand, first detail line, title,
                canonical link, currency or base price) is missing.
        """
        def find_between(s, first, last):
            """Return the part of ``s`` between ``first`` and ``last``, or ""."""
            try:
                start = s.index(first) + len(first)
                end = s.index(last, start)
                return s[start:end]
            except ValueError:
                return ""

        def first_or_blank(query):
            """Return the first match of an XPath query, or "" when there is none."""
            matches = response.selector.xpath(query).extract()
            return matches[0] if matches else ""

        datetime = int(str(int(time.time() * 100)))  #Don't change!
        random.seed(1412112 + datetime)  #Don't change!
        item = NuyolkItem()  #Don't change!
        item['prod_id'] = str(datetime) + str(
            int(random.uniform(100000, 999999)))  #Don't change!
        item['affiliate_partner'] = "viglink"
        item['brand'] = response.selector.xpath(
            '//a[@id="product-brand"]/text()').extract()[0]

        # First detail line is mandatory; the paragraph lines are optional.
        detail_lines = [
            response.selector.xpath(
                '//div[@id="details"]//span/text()').extract()[0]
        ]
        detail_lines.extend(
            response.selector.xpath(
                './/div[@id="productDetail-details"]//p/text()').extract())
        # Lines containing any of these fragments (care / sizing text) are
        # dropped from the description.
        skipwords = ("clean", "instructions", "cm", "wash", "in.", "inch",
                     "size", "mm ")
        detail_lines = [
            line for line in detail_lines
            if not any(word in line for word in skipwords)
        ]
        item['long_desc'] = " | ".join(detail_lines)

        item['short_desc'] = response.selector.xpath(
            '//h1[@class="heading1"]/text()').extract()[0]
        item['product_link'] = response.selector.xpath(
            '//head/link[@rel="canonical"]/@href').extract()[0]
        item['cat_1'] = ""
        item['cat_2'] = ""
        item['cat_3'] = ""
        item['cat_code'] = ""
        item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        item['image_urls'] = ""
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""

        # imglink_1 .. imglink_6 come from the carousel entries; missing
        # entries are stored as "".
        for position in range(1, 7):
            item['imglink_%d' % position] = first_or_blank(
                '//ul[@id="carousel"]/li[%d]/a/@href' % position)

        mcats = response.selector.xpath(
            './/ul[@itemprop="category"]//li//a/text()').extract()

        # mcat_1 .. mcat_5, padded with "" when fewer categories exist.
        for index in range(5):
            attr = 'mcat_' + str(index + 1)
            item[attr] = mcats[index] if index < len(mcats) else ""

        item['mcat_code'] = ""
        item['merchant'] = "AHAlife"
        item['merchant_id'] = "SN4NSZ"
        item['merchant_prod_id'] = find_between(response.url, "/product/", "/")
        item['is_available'] = True
        item['currency'] = response.xpath(
            '//meta[@itemprop="priceCurrency"]/@content').extract()[0]

        if item['currency'] == 'USD':
            item['currency_symbol'] = '$'
        else:
            item['currency_symbol'] = '?'

        item['price_orig'] = int(
            float(
                response.selector.xpath(
                    '//div[@class="product-price sku-price"]/@data-base-price'
                ).extract()[0]))
        item['price'] = item['price_orig']
        item['price_sale'] = item['price_orig']
        item['on_sale'] = False  #BOOLEAN
        item['price_perc_discount'] = 0
        item['primary_color'] = ""

        # Tags: brand, title, first category, remaining categories, description.
        tag_parts = [item['brand'], item['short_desc'], item['mcat_1']]
        tag_parts.extend(mcats[1:])
        tag_parts.append(item['long_desc'])
        item['tags'] = " ".join(tag_parts)

        yield item
Exemple #10
0
    def parse(self, response):
        """Build one NuyolkItem from a Saks Off 5th product page and yield it.

        The discount percentage is computed with float division so it is
        correct regardless of whether ``/`` on two ints truncates.

        Raises:
            IndexError: if the short description, canonical link, master SKU
                or standard price node is missing.
        """
        def price_of(query):
            """Return the first match of ``query`` as a whole-number price.

            The first character (the currency symbol) is dropped before
            parsing. Raises IndexError when the node is absent.
            """
            return int(float(response.selector.xpath(query).extract()[0][1:]))

        datetime = int(str(int(time.time()*100))) #Don't change!
        random.seed(1412112 + datetime) #Don't change!

        item = NuyolkItem() #Don't change!
        item['prod_id'] = str(datetime) + str(int(random.uniform(100000, 999999))) #Don't change!

        item['affiliate_partner'] = "viglink"
        item['brand'] = "Saks Off 5th"
        item['long_desc'] = " | ".join(response.selector.xpath('//div[@itemprop="description"]/ul/li/text()').extract())
        item['short_desc'] = response.selector.xpath('//div[@class="pdt-short-desc o5-product-short-decription"]/span/text()').extract()[0]
        item['product_link'] = response.selector.xpath('//head/link[@rel="canonical"]/@href').extract()[0]

        item['cat_1'] = ""
        item['cat_2'] = ""
        item['cat_3'] = ""
        item['cat_code'] = ""

        # NOTE(review): `unicode` is a Python 2 builtin; this block is kept
        # Python 2 compatible as written -- confirm the target interpreter.
        item['date_added'] = [unicode(str(time.strftime("%d/%m/%Y %H:%M:%S")), "utf-8")]
        item['date_last_updated'] = [unicode(str(time.strftime("%d/%m/%Y %H:%M:%S")), "utf-8")]

        item['image_urls'] = ""
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""

        # Image URLs are derived from the master SKU: the bare SKU for the
        # first image, then the SKU followed by A1..A5.
        sku_query = '//div[@id="js-product-number"]/@data-master-sku'
        skus = response.selector.xpath(sku_query).extract()
        image_suffixes = ("", "A1", "A2", "A3", "A4", "A5")
        for position, suffix in enumerate(image_suffixes, 1):
            attr = 'imglink_' + str(position)
            if skus:
                item[attr] = ("http://image.s5a.com/is/image/saksoff5th/" +
                              skus[0] + suffix + "_247x329.jpg")
            else:
                item[attr] = ""

        item['mcat_1'] = ""
        item['mcat_2'] = ""
        item['mcat_3'] = ""
        item['mcat_4'] = ""
        item['mcat_5'] = ""
        item['mcat_code'] = ""

        item['merchant'] = "Saks Off 5th"
        item['merchant_id']  = "E78883"
        item['merchant_prod_id'] = response.selector.xpath(sku_query).extract()[0]

        item['is_available'] = 'True' #BOOLEAN
        item['currency'] = "USD"
        item['currency_symbol'] = "$"

        standard_query = '//span[@class="o5-price-standard"]/text()'
        sales_query = '//span[@class="price-sales o5-price-sales"]/text()'
        try:
            orig = price_of(standard_query)
            sale = price_of(sales_query)
        except IndexError:
            # One of the two price nodes is missing: treat the item as not on
            # sale and rely on the standard price alone (IndexError propagates
            # if that one is missing as well).
            orig = price_of(standard_query)
            sale = orig

        if orig != sale:
            item['price_orig'] = orig
            item['price_sale'] = sale
            # float() avoids integer division, which would make every
            # discount evaluate to 100 when `/` truncates.
            item['price_perc_discount'] = int(100 - 100 * (float(sale) / orig))
            item['price'] = item['price_sale']
            item['on_sale'] = 'True' #BOOLEAN
        else:
            item['price_orig'] = orig
            item['price'] = item['price_orig']
            item['price_sale'] = ""
            item['on_sale'] = 'False' #BOOLEAN

        item['primary_color'] = ""

        tags = [str(item['brand']), str(item['short_desc']), str(item['long_desc'])]
        item['tags'] = " ".join(tags)

        yield item
Exemple #11
0
 def parse(self, response):
     """Build one NuyolkItem from a Burke Decor product page and yield it.

     Yields nothing when the SKU node or the first description paragraph is
     missing. Category and description filtering use real lists, so the
     result does not depend on whether ``filter()`` returns a list.

     Raises:
         IndexError: if the product name, currency or price node is missing.
         ValueError: if the product price text cannot be parsed as a number.
     """
     def price_at(query):
         """Return the first match of ``query`` as a whole-number price.

         Surrounding whitespace, the leading currency symbol and thousands
         separators are removed before parsing.
         """
         text = response.selector.xpath(query).extract()[0]
         return int(float(text.strip()[1:].replace(",", "")))

     # NOTE(review): this pauses the calling thread for every response;
     # confirm it is still needed rather than a crawler-level delay setting.
     time.sleep(0.5)
     datetime = int(str(int(time.time() * 100)))
     random.seed(1412112 + datetime)
     item = NuyolkItem()
     item['is_available'] = True
     item['affiliate_partner'] = "viglink"
     item['prod_id'] = str(
         str(datetime) + str(int(random.uniform(100000, 999999))))
     item['product_link'] = response.url
     item['merchant'] = "Burke Decor"
     try:
         item['merchant_prod_id'] = response.selector.xpath(
             '//*[@class="product-status"]/text()').extract()[0].replace(
                 "SKU: ", "").strip()
     except IndexError:
         return
     #item['upc'] ##TODO
     item['merchant_id'] = "A82I78"
     try:
         item['brand'] = response.selector.xpath(
             '//*[@class="product_meta"]//a/text()').extract()[0]
     except IndexError:
         item['brand'] = ""
     item['short_desc'] = response.selector.xpath(
         '//*[@itemprop="name"]/@content').extract()[0]

     try:
         desc_lines = [
             response.selector.xpath(
                 '//p[@itemprop="description"]/following::p/text()').
             extract()[0]
         ]
     except IndexError:
         return  ##OOS
     # A lone non-breaking space is an empty paragraph, not a description.
     if desc_lines == [u'\xa0']:
         desc_lines = []
     bullet_texts = response.selector.xpath(
         '//p[@itemprop="description"]/following::ul[1]//text()'
     ).extract()
     # Only bullet texts mentioning a percentage or a finish are kept.
     desc_lines.extend(
         text for text in bullet_texts if "%" in text or "Finish" in text)
     skipwords = ("clean", "instructions", "cm", "wash", "in.", "inch",
                  "size", "mm ", "Weight", "Dimensions")
     desc_lines = [
         line for line in desc_lines
         if not any(word in line for word in skipwords)
     ]
     item['long_desc'] = " | ".join(desc_lines).strip()

     item['primary_color'] = ""  #later
     item['currency'] = response.selector.xpath(
         '//meta[@itemprop="priceCurrency"]/@content').extract()[0]
     if item['currency'] == 'USD':
         item['currency_symbol'] = '$'
     else:
         item['currency_symbol'] = '?'  ##TODO

     # The item is on sale when both the product price and the compare
     # price parse; otherwise fall back to the product price alone.
     try:
         sale = price_at('//*[@id="ProductPrice"]/text()')
         orig = price_at('//*[@id="ComparePrice"]/text()')
         discount = int((1 - float(sale) / float(orig)) * 100)
     except (IndexError, ValueError, ZeroDivisionError):
         item['price_orig'] = price_at('//*[@id="ProductPrice"]/text()')
         item['price'] = item['price_orig']
         item['price_sale'] = item['price_orig']
         item['price_perc_discount'] = 0
         item['on_sale'] = False
     else:
         item['price_sale'] = sale
         item['price_orig'] = orig
         item['price_perc_discount'] = discount
         item['price'] = item['price_sale']
         item['on_sale'] = True

     # The scraped URLs only feed imglink_1..6; image_urls itself is
     # stored as "".
     image_urls = [
         'http:' + src.split('?v=', 1)[0]
         for src in response.selector.xpath(
             '//*[@class="product-media"]//img//@src').extract()
     ]
     item['image_urls'] = ""
     item['img_1'] = ""
     item['img_2'] = ""
     item['img_3'] = ""
     item['img_4'] = ""
     item['img_5'] = ""
     for i in range(0, 6):
         attr = 'imglink_' + str(i + 1)
         item[attr] = image_urls[i] if i < len(image_urls) else ""

     # Categories come from the content_category value in the fbq() script.
     category_hits = response.xpath(
         '//script[contains(., "fbq(")]/text()').re(
             r"content_category\: '([^]]+)")
     if category_hits:
         crumbs = category_hits[0].split(",")[0].split(" > ")
     else:
         crumbs = []
     blocked = ["All", "New", "$", "Sale", "Shop"]
     if item['brand']:
         # An empty brand is a substring of every string, so it must not be
         # used as a filter or every category would be discarded.
         blocked.append(item['brand'])
     mcats = [
         crumb for crumb in crumbs
         if not any(word in crumb for word in blocked)
     ]
     item['mcat_code'] = ""
     for i in range(0, 5):
         attr = 'mcat_' + str(i + 1)
         # The final entry of the path is never stored as a category field.
         item[attr] = mcats[i] if i < len(mcats) - 1 else ""
     item['cat_code'] = ""
     item['cat_1'] = ""  #deprecate
     item['cat_2'] = ""  #deprecate
     item['cat_3'] = ""  #deprecate

     tag_parts = [item['brand'], item['short_desc'], item['mcat_1']]
     tag_parts.extend(mcats[1:])
     tag_parts.append(item['long_desc'])
     item['tags'] = " ".join(tag_parts)
     item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
     item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
     yield item
Exemple #12
0
    def parse(self, response):
        """Build one NuyolkItem from an ASOS US product page and yield it.

        Prices are read from the inline script containing "current",
        "previous" and "rrp" values. A missing or unparsable previous/rrp
        value is treated as 0, and the discount is computed with float
        division and guarded against a zero original price.

        Raises:
            AttributeError: if the page has no <title> text.
            IndexError: if the heading, canonical link or product code node
                is missing.
            ValueError: if the current price cannot be parsed.
        """
        def find_between(s, first, last):
            """Return the part of ``s`` between ``first`` and ``last``, or ""."""
            try:
                start = s.index(first) + len(first)
                end = s.index(last, start)
                return s[start:end]
            except ValueError:
                return ""

        def to_float(text):
            """Parse ``text`` as a float, returning 0.0 when it is not numeric."""
            try:
                return float(text)
            except ValueError:
                return 0.0

        datetime = int(str(int(time.time() * 100)))  #Don't change!
        random.seed(1412112 + datetime)  #Don't change!
        item = NuyolkItem()  #Don't change!
        item['prod_id'] = str(datetime) + str(
            int(random.uniform(100000, 999999)))  #Don't change!
        item['affiliate_partner'] = "viglink"
        # The brand is the part of the page title before the first " | ".
        item['brand'] = response.xpath('//title/text()').extract_first().split(
            ' | ')[0]

        descs = response.selector.xpath(
            '//div[@class="product-description"]/span//text()').extract()
        skipwords = ("clean", "instructions", "cm", "wash", "in.", "inch",
                     "size", "mm ")
        # Drop the four-space filler nodes, then any line containing a
        # skip word.
        descs = [
            text for text in descs
            if text != '    ' and not any(word in text for word in skipwords)
        ]
        # The first three fragments are glued together; the rest are
        # separated by " | ".
        item['long_desc'] = "".join(descs[0:3]) + " | " + " | ".join(
            descs[3:])
        item['short_desc'] = response.selector.xpath(
            '//div[@class="product-hero"]//h1/text()').extract()[0]
        item['product_link'] = response.selector.xpath(
            '//head/link[@rel="canonical"]/@href').extract()[0]
        item['cat_1'] = ""
        item['cat_2'] = ""
        item['cat_3'] = ""
        item['cat_code'] = ""
        item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        item['image_urls'] = ""
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""

        # The first gallery image is used as-is (direct <img> child).
        hero = response.selector.xpath(
            '//div[@class="product-gallery"]//ul/li[1]/img/@src').extract()
        item['imglink_1'] = hero[0] if hero else ""
        # Images 2..6 have their URL size parameters rewritten.
        for position in range(2, 7):
            found = response.selector.xpath(
                '//div[@class="product-gallery"]//ul/li[%d]//img/@src' %
                position).extract()
            if found:
                item['imglink_%d' % position] = found[0].replace(
                    "S$&wid=40", "XXL$&wid=513")
            else:
                item['imglink_%d' % position] = ""

        mcats = response.selector.xpath(
            '//*[@id="more-from"]/descendant::a/text()').extract()
        for i in range(0, 5):
            attr = 'mcat_' + str(i + 1)
            item[attr] = mcats[i] if i < len(mcats) else ""
        item['mcat_code'] = ""
        item['merchant'] = "ASOS US"
        item['merchant_id'] = "IU95X3"
        item['merchant_prod_id'] = str(
            response.selector.xpath(
                '//*[@class="product-code"]//span/text()').extract()[0])
        item['is_available'] = True  #BOOLEAN

        p = "\n".join(
            response.selector.xpath(
                '//script[contains(., "current")]/text()').extract())
        item['currency'] = find_between(p, '"currency":"', '",')[0:3]
        if item['currency'] == 'USD':
            item['currency_symbol'] = '$'
        else:
            item['currency_symbol'] = '?'
        item['price'] = int(float(find_between(p, '"current":', ",")))
        prev = to_float(find_between(p, '"previous":', ","))
        rrp = to_float(find_between(p, '"rrp":', ","))
        if prev == 0 and rrp == 0:
            item['price_orig'] = item['price']
            item['price_sale'] = item['price']
            item['price_perc_discount'] = 0
            item['on_sale'] = False
        else:
            item['price_sale'] = item['price']
            if prev > 0:
                item['price_orig'] = int(prev)
            elif rrp > 0:
                item['price_orig'] = int(rrp)
            else:
                item['price_orig'] = int(0)  ###TODO ???
            item['on_sale'] = True
            if item['price_orig']:
                item['price_perc_discount'] = int(
                    100 - 100 *
                    (float(item['price_sale']) / item['price_orig']))
            else:
                # No usable original price: report no discount instead of
                # dividing by zero.
                item['price_perc_discount'] = 0
        item['primary_color'] = ""
        tags = [
            str(item['brand']),
            str(item['short_desc']),
            str(item['long_desc'])
        ]
        item['tags'] = " ".join(tags)
        yield item
Exemple #13
0
    def parse(self, response):
        """Build one NuyolkItem from a REVOLVE product page and yield it.

        Yields nothing when the brand meta tag is missing, or when neither
        the strikethrough price pair nor the price meta tag can be read.
        Category filtering produces a real list, so ``len()`` and slicing
        work whether or not ``filter()`` returns a list.

        Raises:
            IndexError: if the title, canonical link, product code or
                currency node is missing.
            ValueError: if a strikethrough price cannot be parsed.
        """
        def first_or_blank(query):
            """Return the first match of an XPath query, or "" when there is none."""
            matches = response.selector.xpath(query).extract()
            return matches[0] if matches else ""

        datetime = int(str(int(time.time() * 100)))  #Don't change!
        random.seed(1412112 + datetime)  #Don't change!
        item = NuyolkItem()  #Don't change!
        item['prod_id'] = str(datetime) + str(
            int(random.uniform(100000, 999999)))  #Don't change!
        item['affiliate_partner'] = "viglink"
        try:
            item['brand'] = response.selector.xpath(
                '//meta[@name="twitter:data2"]/@content').extract()[0]
        except IndexError:
            return

        # At most the first seven bullet points are considered; style-number
        # and centimetre lines are dropped.
        bullets = response.selector.xpath(
            '//div[@class="product-details__content js-tabs__content js-tabs__content-active product-details__description"]/ul/li/text()'
        ).extract()[:7]
        item['long_desc'] = " | ".join(
            line for line in bullets
            if "Style No." not in line and " cm" not in line)
        item['short_desc'] = response.selector.xpath(
            '//*[@class="product-titles"]//h1/text()').extract()[0].strip()
        item['product_link'] = response.selector.xpath(
            '//head/link[@rel="canonical"]/@href').extract()[0]

        item['cat_1'] = ""
        item['cat_2'] = ""
        item['cat_3'] = ""
        item['cat_code'] = ""

        item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))

        item['image_urls'] = ""
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""

        # imglink_1 .. imglink_6 come from the slideshow pager anchors;
        # missing anchors are stored as "".
        for position in range(1, 7):
            item['imglink_%d' % position] = first_or_blank(
                '//div[@id="js-primary-slideshow__pager"]/a[%d]/@data-image'
                % position)

        # Skip the first text node, then drop blanks, the brand itself and
        # anything mentioning the merchant.
        raw_cats = response.selector.xpath(
            '//*[@class="pdp_lower_area"]/div[5]//li//text()').extract()[1:]
        mcats = [
            name for name in (text.strip() for text in raw_cats)
            if name != "" and name != item['brand'] and "REVOLVE" not in name
        ]
        for i in range(0, 5):
            attr = 'mcat_' + str(i + 1)
            # The final entry is never stored as a category field.
            item[attr] = mcats[i] if i < len(mcats) - 1 else ""
        item['mcat_code'] = ""
        item['merchant'] = "REVOLVE"
        item['merchant_id'] = "35KQ17"
        item['merchant_prod_id'] = response.selector.xpath(
            '//input[@id="productCode"]/@value').extract()[0]
        item['is_available'] = True
        item['currency'] = response.selector.xpath(
            '//meta[@property="wanelo:product:price:currency"]/@content'
        ).extract()[0]
        if item['currency'] == 'USD':
            item['currency_symbol'] = '$'
        else:
            item['currency_symbol'] = '?'  ##TODO
        try:
            # The sale price is the last div text preceding the
            # strikethrough price; both drop their first two characters.
            sale = int(
                float(
                    response.selector.xpath(
                        '//div[@class="prices__retail--strikethrough"]/preceding::div/text()'
                    ).extract()[-1][2:].replace(',', '')))
            orig = int(
                float(
                    response.selector.xpath(
                        '//div[@class="prices__retail--strikethrough"]//text()'
                    ).extract()[0][2:].replace(',', '')))
            if orig != sale:
                item['price_orig'] = orig
                item['price_sale'] = sale
                item['price_perc_discount'] = int(100 - 100 *
                                                  (float(sale) / float(orig)))
                item['price'] = item['price_sale']
                item['on_sale'] = True
            else:
                item['price_orig'] = orig
                item['price'] = item['price_orig']
                item['price_sale'] = item['price_orig']
                item['price_perc_discount'] = 0
                item['on_sale'] = False
        except IndexError:
            # No strikethrough price on the page: use the price meta tag.
            try:
                item['price_orig'] = int(
                    float(
                        response.selector.xpath(
                            '//meta[@itemprop="price"]/@content').extract()
                        [0]))
            except (IndexError, ValueError):
                return
            item['price'] = item['price_orig']
            item['price_sale'] = item['price_orig']
            item['on_sale'] = False  #BOOLEAN
            item['price_perc_discount'] = 0
        item['primary_color'] = ""

        tag_parts = [item['brand'], item['short_desc'], item['mcat_1']]
        tag_parts.extend(mcats[1:])
        tag_parts.append(item['long_desc'])
        item['tags'] = " ".join(tag_parts)
        yield item
Exemple #14
0
 def parse(self, response):
     """Build one NuyolkItem from a Selfridges product page and yield it.

     Any failure while extracting a field makes the callback yield nothing;
     the failure is logged at debug level instead of being discarded.
     """
     import logging

     def amount(query, skip=0):
         """Return the first match of ``query`` as a whole-number price.

         The text is stripped, thousands separators are removed and the
         first ``skip`` characters are dropped before parsing. Raises
         IndexError when the node is absent.
         """
         text = str(response.selector.xpath(query).extract()[0])
         return int(float(text.strip().replace(",", "")[skip:]))

     datetime = int(str(int(time.time() * 100)))  #Don't change!
     random.seed(1412112 + datetime)  #Don't change!
     item = NuyolkItem()  #Don't change!
     try:
         item['brand'] = response.selector.xpath(
             '//span[@class="brand"]/a/text()').extract()[0]
         item['cat_code'] = ""
         item['cat_1'] = ""  #deprecate
         item['cat_2'] = ""  #deprecate
         item['cat_3'] = ""  #deprecate
         item['currency'] = str(
             response.selector.xpath(
                 '//*[@class="translateFlag"]/a/span/text()').extract()[0])
         item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))

         # Use the first lazily loaded product image; when there is none,
         # fall back to the plain image wrapper sources.
         images = response.selector.xpath(
             '//div[@class="productImage"]//img/@data-rvsrc').extract()[:1]
         if not images:
             images = response.selector.xpath(
                 '//*[@class="pImgWrap"]/img/@src').extract()
         item['img_1'] = ""
         item['img_2'] = ""
         item['img_3'] = ""
         item['img_4'] = ""
         item['img_5'] = ""
         long_desc = response.selector.xpath(
             '//div[@class="selfridgesSaysInner"]/div/p[@class="hiddenDescription"]/text()'
         ).extract()[0].strip()  ##encoding problem
         item['long_desc'] = long_desc.replace("<b>",
                                               "").replace("</b>", "")
         item['mcats'] = ""  #later #Do NLP predictions
         item['mcat_1'] = ""  #later #Do NLP predictions
         item['mcat_2'] = ""  #later #Do NLP predictions
         item['mcat_3'] = ""  #later #Do NLP predictions
         item['mcat_4'] = ""  #later #Do NLP predictions
         item['mcat_5'] = ""  #later #Do NLP predictions
         item['merchant_id'] = "TO663Y"
         item['merchant_prod_id'] = str(
             response.selector.xpath(
                 '//p[@class="pcode"]/span[@class="val"]/text()').extract()
             [0].strip())
         try:
             # The "was" price keeps its currency symbol (dropped with
             # skip=1); the current price span holds the bare number.
             orig = amount('//p[@class="wasPrice"]/text()', skip=1)
             sale = amount('//p[@class="price red"]/span[2]/text()')
             if orig != sale:
                 item['price_orig'] = orig
                 item['price_sale'] = sale
                 # float() keeps the percentage independent of how `/`
                 # behaves on two ints.
                 item['price_perc_discount'] = int(
                     100 - 100 * float(sale) / orig)
                 item['price'] = sale
             else:
                 item['price_orig'] = orig
                 item['price'] = orig
         except IndexError:
             # No was/now pair on the page: use the regular price.
             item['price_orig'] = amount(
                 '//p[@class="price"]/span[2]/text()')
             item['price'] = item['price_orig']
         item['primary_color'] = ""  #later
         item['prod_id'] = int(
             str(datetime) +
             str(int(random.uniform(100000, 999999))))  #Don't change!
         item['product_link'] = response.selector.xpath(
             '//link[@rel="canonical"]/@href').extract()[0]
         item['short_desc'] = str(
             response.selector.xpath('//head/title/text()').extract()[0])
         tags = [str(item['brand']), item['short_desc'],
                 item['long_desc']]
         item['tags'] = " ".join(tags)
         item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
         item['merchant'] = 'Selfridges'
         item['imglinks'] = images
         for i in range(0, 6):
             attr = 'imglink_' + str(i + 1)
             if i < len(item['imglinks']):
                 item[attr] = str(item['imglinks'][i])
             else:
                 item[attr] = ""
         item['is_available'] = True  #Don't change! #Fix later!
         item['affiliate_partner'] = "viglink"
         yield item
     except Exception as e:
         # Best effort: pages that cannot be parsed are skipped, but the
         # reason is recorded so extraction bugs stay visible.
         logging.getLogger(__name__).debug(
             "Skipping %s: %r", response.url, e, exc_info=True)
         return
Exemple #15
0
    def parse(self, response):
        """Build one ``NuyolkItem`` from a Harrods product page.

        Yields a single populated item, or nothing at all when the page has
        no ``productname`` span. The long description and the six alternate
        image links fall back to an empty string when their node is missing;
        every other lookup still raises ``IndexError`` if the page lacks it.
        """
        datetime = int(str(int(time.time() * 100)))  #Don't change!
        random.seed(1412112 + datetime)  #Don't change!

        item = NuyolkItem()  #Don't change!
        item['prod_id'] = str(datetime) + str(
            int(random.uniform(100000, 999999)))  #Don't change!

        def first_match(query):
            # First result of the XPath query; IndexError when nothing matches.
            return response.selector.xpath(query).extract()[0]

        def first_match_or_blank(query):
            # First result of the XPath query, or "" when nothing matches.
            matches = response.selector.xpath(query).extract()
            return matches[0] if matches else ""

        item['affiliate_partner'] = "viglink"
        item['brand'] = "Harrods"
        item['long_desc'] = first_match_or_blank(
            '//p[@class="description"]/text()')

        # A page without a product name is not emitted at all.
        try:
            item['short_desc'] = first_match(
                '//span[@class="productname"]/text()').strip()
        except IndexError:
            return
        item['product_link'] = first_match(
            '//head/link[@rel="canonical"]/@href')

        # Fields this spider does not populate are emitted as empty strings.
        for blank_field in ('cat_1', 'cat_2', 'cat_3', 'cat_code',
                            'image_urls', 'img_1', 'img_2', 'img_3', 'img_4',
                            'img_5', 'mcat_1', 'mcat_2', 'mcat_3', 'mcat_4',
                            'mcat_5', 'mcat_code', 'primary_color'):
            item[blank_field] = ""

        # Take the timestamp once so both date fields always agree.
        stamp = unicode(str(time.strftime("%d/%m/%Y %H:%M:%S")), "utf-8")
        item['date_added'] = stamp
        item['date_last_updated'] = stamp

        # Up to six alternate-view images; missing slots stay empty.
        for position in range(1, 7):
            item['imglink_%d' % position] = first_match_or_blank(
                '//ul[@class="alt_view"]/li[%d]/a/@href' % position)

        item['merchant'] = "Harrods"
        item['merchant_id'] = "2GSE52"
        # Drops the first 13 characters of the product-code text
        # (presumably a fixed label prefix -- TODO confirm against the page).
        item['merchant_prod_id'] = first_match(
            '//span[@class="product_code"]/text()')[13:]

        item['is_available'] = 'True'  #BOOLEAN
        item['currency'] = first_match(
            '//span[@class="country-selector_currency"]/text()')
        item['currency_symbol'] = first_match(
            '//span[@class="country-selector_currency"]/span[@class="code"]/text()'
        )

        # Skip the leading currency symbol and remove thousands separators:
        # float() rejects text such as "1,250.00".
        price_text = first_match(
            '//span[@class="prices price"]/span/span/text()')
        price = int(float(price_text[1:].replace(",", "")))

        # Sale detection is not implemented here: all three price fields carry
        # the single displayed price and the item is reported as not on sale.
        item['price'] = price
        item['price_orig'] = price
        item['price_sale'] = price
        item['price_perc_discount'] = 0
        item['on_sale'] = 'False'

        # NOTE(review): under Python 2, str() raises UnicodeEncodeError on
        # non-ASCII text -- confirm the descriptions are ASCII-only.
        tags = [
            str(item['brand']),
            str(item['short_desc']),
            str(item['long_desc'])
        ]
        item['tags'] = " ".join(tags)

        yield item