def parse(self, response):
    # Scrape one product page into a NuyolkItem (merchant 70856L).
    # NOTE: local `datetime` shadows the stdlib module name; it is just a
    # centisecond timestamp used to build a pseudo-unique product id.
    datetime = int(str(int(time.time()*100)))
    random.seed(1412112 + datetime)
    item = NuyolkItem()
    # prod_id = <timestamp><6 random digits>.  This spider stores it as an
    # int; sibling spiders in this file store it as str -- TODO confirm which
    # the pipeline expects.
    item['prod_id'] = int(str(datetime) + str(int(random.uniform(100000, 999999))))
    # Positional /head/meta[N] XPaths are brittle: any IndexError below
    # aborts the whole item (there is no try/except in this spider).
    item['product_link'] = response.selector.xpath('/html/head/meta[12]/@content').extract()[0]
    item['merchant_prod_id'] = response.selector.xpath('/html/head/meta[17]/@content').extract()[0]
    item['merchant_id'] = "70856L"
    item['brand'] = response.selector.xpath('//h1[@class="brand"]/a/text()').extract()[0]
    item['short_desc'] = response.selector.xpath('//h1[@class="product-name"]/text()').extract()[0]
    item['long_desc'] = response.selector.xpath('/html/head/meta[4]/@content').extract()[0]
    item['primary_color'] = "" #later
    item['currency'] = response.selector.xpath('/html/head/meta[19]/@content').extract()[0]
    #If item is on sale,
    if (response.selector.xpath("//span[@class='price-standard']/text()").extract() != []):
        # [1:] strips the leading currency symbol; prices stay strings here.
        item['price_orig'] = response.selector.xpath("//span[@class='price-standard']/text()").extract()[0][1:]
        item['price_sale'] = response.selector.xpath("//span[@class='price-sales']/text()").extract()[0][1:]
        item['price_perc_discount'] = int((1 - float(item['price_sale'])/float(item['price_orig']))*100)
        item['price'] = item['price_sale']
    else:
        item['price_orig'] = response.selector.xpath("//span[@class='price-sales']/text()").extract()[0][1:]
        item['price'] = item['price_orig']
    item['image_urls'] = response.selector.xpath('//*[@class="zoom masterTooltip"]/img/@src').extract()
    #new
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    #new
    # Breadcrumb categories; one mcat_<n> field per crumb.
    item['mcats'] = response.selector.xpath('//*[@id="main"]/div/div/ol/li/a/text()').extract()
    for i in range(0, len(item['mcats'])):
        attr = 'mcat_' + str(i+1)
        item[attr] = item['mcats'][i]
    item['cat_code'] = ""
    item['cat_1'] = "" #deprecate
    item['cat_2'] = "" #deprecate
    item['cat_3'] = "" #deprecate
    # Free-text search tags: brand + name + categories + meta description.
    tags = [str(response.selector.xpath('//h1[@class="brand"]/a/text()').extract()[0]),
            str(response.selector.xpath('//h1[@class="product-name"]/text()').extract()[0]),
            str(" ".join(item['mcats'])),
            str(response.selector.xpath('/html/head/meta[4]/@content').extract()[0])]
    item['tags'] = " ".join(tags)
    # Python 2 `unicode`; stored as a one-element list (siblings use a str).
    item['date_added'] = [unicode(str(time.strftime("%d/%m/%Y %H:%M:%S")), "utf-8")]
    yield item
def parse(self, response):
    """Extract brand, breadcrumb categories, merchant product id and the
    canonical product link; silently drop the page if any field is missing.
    """
    item = NuyolkItem() #Don't change!
    sel = response.selector
    try:
        raw_brand = sel.xpath(
            '//*[@class="productDesc"]//p[@itemprop="brand"]/a/text()'
        ).extract()[0]
        item['brand'] = str(raw_brand).strip()
        # Drop the first breadcrumb (the site root); keep the rest.
        crumbs = sel.xpath('//nav[@id="breadcrumb"]//li/a/text()').extract()
        item['mcats'] = crumbs[1:]
        item['merchant_prod_id'] = str(
            sel.xpath('//form/input[@name="productId"]/@value').extract()[0])
        item['product_link'] = str(
            sel.xpath('//*[@id="canonicalUrl"]/@href').extract()[0])
        yield item
    except Exception:
        # Any missing node means this is not a usable product page.
        return
def parse(self, response):
    # Scrape a Lamps Plus product page into a NuyolkItem.
    # Local `datetime` is a centisecond timestamp, not the stdlib module.
    datetime = int(str(int(time.time()*100)))
    random.seed(1412112 + datetime)
    item = NuyolkItem()
    item['is_available'] = True
    item['affiliate_partner'] = "viglink"
    item['prod_id'] = str(str(datetime) + str(int(random.uniform(100000, 999999))))
    item['product_link'] = response.url
    item['merchant'] = "Lamps Plus"
    try:
        item['merchant_prod_id'] = response.selector.xpath('//*[@id="pdProdSku"]/text()').extract()[0].replace('- Style # ', '')
    except:
        # No SKU node -> not a product page; drop silently.
        return
    item['merchant_id'] = "P2B2J5"
    try:
        item['brand'] = response.selector.xpath('//*[@id="pnlBrand"]/@content').extract()[0]
    except:
        item['brand'] = ""
    try:
        item['short_desc'] = response.selector.xpath('//*[@id="h1ProductName"]/text()').extract()[0].strip()
    except:
        return
    # Long description = key sentence + itemprop description + bullet list,
    # minus any line containing a sizing/care keyword.
    ld = [response.selector.xpath('//*[@id="pdKeySentence"]/text()').extract()[0].strip()]
    ld2 = [response.selector.xpath('//p[@itemprop="description"]/text()').extract()[0].strip()]
    ld3 = response.selector.xpath('//*[@id="pdDescBullets"]/li/text()').extract()
    ld.extend(ld2)
    ld.extend(ld3)
    skipwords = ["clean", "instructions", "cm", "\" ", "wash", "in.", "inch", "size", "mm ", "size", "weighs", "lbs."]
    for w in skipwords:
        # Boolean-mask filtering via numpy (np is imported at file level).
        ld = list(np.array(ld)[np.array([w not in x for x in ld])])
    item['long_desc'] = " | ".join(ld).strip()
    item['primary_color'] = "" #later
    item['currency'] = response.selector.xpath('//meta[@itemprop="priceCurrency"]/@content').extract()[0]
    if (item['currency'] == 'USD'):
        item['currency_symbol'] = '$'
    else:
        item['currency_symbol'] = '?'
    ##TODO
    #If item is on sale,
    #[4:].replace(",", "")
    try:
        # Sale case: low/high itemprop price pair is present.
        item['price_sale'] = int(float(response.selector.xpath("//*[@itemprop='lowPrice']/@content").extract()[0].replace(",", "")))
        item['price_orig'] = int(float(response.selector.xpath("//*[@itemprop='highPrice']/@content").extract()[0].replace(",", "")))
        item['price_perc_discount'] = int((1 - float(item['price_sale'])/float(item['price_orig']))*100)
        item['price'] = item['price_sale']
        item['on_sale'] = True
    except:
        item['price_orig'] = int(float(response.selector.xpath("//*[@itemprop='price']/@content").extract()[0].replace(",", "")))
        item['price'] = item['price_orig']
        item['price_sale'] = item['price_orig']
        item['price_perc_discount'] = 0
        item['on_sale'] = False
    imgs = response.selector.xpath('//*[@id="pdAddlImgs"]//img/@src').extract()
    # Strip the resize query between 'fpx?' and 'fmt=jpeg' to get larger
    # images.  `find_between` is defined elsewhere in this project.
    item['image_urls'] = [x.replace(find_between(x, 'fpx?', 'fmt=jpeg'), "") for x in imgs]
    #response.selector.xpath('//*[@class="zoom masterTooltip"]/img/@src').extract()
    #new
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    for i in range(0,6):
        attr = 'imglink_' + str(i+1)
        try:
            item[attr] = item['image_urls'][i]
        except:
            item[attr] = ""
    # Breadcrumb categories (Python 2 `filter` returns a list, so slicing
    # below is valid).
    mcats = response.selector.xpath('//*[@id="divBreadCrumb"]//text()').extract()
    mcats = [x.strip() for x in mcats]
    mcats = filter(lambda x: x != "" and x != "|", mcats)
    mcats = mcats[1:-2]
    item['mcat_code'] = ""
    item['image_urls'] = ""
    for i in range(0, 5):
        attr = 'mcat_' + str(i + 1)
        try:
            # The last crumb is blanked on purpose (product-name crumb).
            if i == len(mcats) - 1:
                item[attr] = ""
            else:
                item[attr] = mcats[i]
        except:
            item[attr] = ""
    item['cat_code'] = ""
    item['cat_1'] = "" #deprecate
    item['cat_2'] = "" #deprecate
    item['cat_3'] = "" #deprecate
    # Tags: mixed strings and a list; numpy.hstack flattens them.
    t = [item['brand'], item['short_desc'], item['mcat_1'], mcats[1:], item['long_desc']]
    item['tags'] = " ".join(list(numpy.hstack(t)))
    item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    yield item
def parse(self, response):
    # Scrape a Society6 product page into a NuyolkItem.
    datetime = int(str(int(time.time() * 100)))
    random.seed(1412112 + datetime)
    item = NuyolkItem()
    item['is_available'] = True
    item['affiliate_partner'] = "viglink"
    item['prod_id'] = str(
        str(datetime) + str(int(random.uniform(100000, 999999))))
    item['product_link'] = response.url
    item['merchant'] = "Society6"
    try:
        # Product id lives in the inline dataLayer JS blob: "id":"<value>".
        mpi = response.xpath('//script[contains(., "dataLayer = ")]/text()'
                             ).re('\"id\"\:\"(.*)')[0]
        mpi = mpi.split("\"")[0]
        item['merchant_prod_id'] = mpi
    except:
        pass
    item['merchant_id'] = "7599C0"
    try:
        # Artist name from avatar alt text, formatted like "Name (handle)".
        brand = response.selector.xpath(
            '//*[@class="user-avatar"]/a/img/@alt').extract()[0]
        brand = brand.split(" (")[0].strip()
        item['brand'] = brand
    except:
        item['brand'] = ""
    # Page <title> is "<product> by <artist>"; keep only the product part.
    sd = response.selector.xpath('//title/text()').extract()[0]
    sd = sd.split(" by ")[0].capitalize()
    item['short_desc'] = sd
    try:
        ld = [
            response.selector.xpath(
                '//*[@id="about-the-art-description"]/text()').extract()
            [0].strip()
        ]
    except:
        ld = []
    # Split the description into sentences and restore the periods lost by
    # the ". " split (the last fragment keeps whatever it ended with).
    ld2 = response.selector.xpath('//*[@id="product-description"]//text()'
                                  ).extract()[0].strip().split(". ")
    ld2last = ld2[-1]
    ld2 = [x + "." for x in ld2[:-1]]
    ld2.append(ld2last)
    ld.extend(ld2)
    # Drop sentences about sizing/care.
    skipwords = [
        "clean", "instructions", "cm", "\" ", "wash", "in.", "inch", "size",
        "mm ", "size", "weighs", "lbs."
    ]
    for w in skipwords:
        ld = list(np.array(ld)[np.array([w not in x for x in ld])])
    item['long_desc'] = " | ".join(ld).strip()
    item['primary_color'] = "" #later
    item['currency'] = response.selector.xpath(
        '//meta[@property="og:price:currency"]/@content').extract()[0]
    if (item['currency'] == 'USD'):
        item['currency_symbol'] = '$'
    else:
        item['currency_symbol'] = '?'
    ##TODO
    #If item is on sale,
    #[4:].replace(",", "")
    try:
        #####TODO (cannot find products on sale)
        item['price_sale'] = int(
            float(
                response.selector.xpath(
                    '//meta[@property="og:price:sale"]/@content').extract(
                    )[0].replace(",", "")))
        item['price_orig'] = int(
            float(
                response.selector.xpath(
                    '//meta[@property="og:price:orig"]/@content').extract(
                    )[0].replace(",", "")))
        item['price_perc_discount'] = int(
            (1 - float(item['price_sale']) / float(item['price_orig'])) * 100)
        item['price'] = item['price_sale']
        item['on_sale'] = True
    except:
        # Normal case: single og:price:amount value.
        item['price_orig'] = int(
            float(
                response.selector.xpath(
                    '//meta[@property="og:price:amount"]/@content').
                extract()[0].replace(",", "")))
        item['price'] = item['price_orig']
        item['price_sale'] = item['price_orig']
        item['price_perc_discount'] = 0
        item['on_sale'] = False
    item['image_urls'] = response.selector.xpath(
        '//*[@id="product-image-main"]//img/@src').extract()
    #response.selector.xpath('//*[@class="zoom masterTooltip"]/img/@src').extract()
    #new
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    for i in range(0, 6):
        attr = 'imglink_' + str(i + 1)
        try:
            item[attr] = item['image_urls'][i]
        except:
            item[attr] = ""
    # Breadcrumbs: drop "/" separators, the root crumb, and the last crumb.
    # (Python 2 `filter` returns a list, so slicing is valid.)
    mcats = response.selector.xpath(
        '//*[@class="breadcrumb_v2"]//span/text()').extract()
    mcats = filter(lambda x: x != "/", mcats)
    mcats = mcats[1:-1]
    item['mcat_code'] = ""
    item['image_urls'] = ""
    for i in range(0, 5):
        attr = 'mcat_' + str(i + 1)
        try:
            # The final crumb is blanked on purpose.
            if i == len(mcats) - 1:
                item[attr] = ""
            else:
                item[attr] = mcats[i]
        except:
            item[attr] = ""
    item['cat_code'] = ""
    item['cat_1'] = "" #deprecate
    item['cat_2'] = "" #deprecate
    item['cat_3'] = "" #deprecate
    # Mixed strings and a list; numpy.hstack flattens them into tags.
    t = [
        item['brand'], item['short_desc'], item['mcat_1'], mcats[1:],
        item['long_desc']
    ]
    item['tags'] = " ".join(list(numpy.hstack(t)))
    item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    yield item
def parse(self, response):
    # Scrape an HBX product page; sold-out pages are skipped entirely.
    try:
        sold_out_msg = response.selector.xpath(
            '//*[@class="sold-out-header"]/text()').extract()[0]
        print("SOLD OUT--SKIPPED!")
        return
    except:
        # No sold-out header -> page is live; scrape it.
        datetime = int(str(int(time.time() * 100)))
        random.seed(1412112 + datetime)
        item = NuyolkItem()
        item['is_available'] = True
        item['affiliate_partner'] = "viglink"
        item['prod_id'] = str(datetime) + str(
            int(random.uniform(100000, 999999)))
        item['product_link'] = response.selector.xpath(
            '/html/head/meta[23]/@content').extract()[0]
        item['merchant'] = "HBX"
        item['merchant_prod_id'] = response.selector.xpath(
            '//*[@id="product-summary"]/@data-id').extract()[0]
        #skipped
        item['merchant_id'] = "70856L"
        # NOTE(review): brand and short_desc read the same XPath -- the
        # short description probably should come from a product-name node;
        # confirm against the live page.
        item['brand'] = response.selector.xpath(
            '//h1[@class="brand"]/text()').extract()[0]
        item['short_desc'] = response.selector.xpath(
            '//h1[@class="brand"]/text()').extract()[0]
        ld = response.selector.xpath(
            './/*[@class="description"]/p/text()').extract()
        item['long_desc'] = " | ".join(ld).strip()
        item['primary_color'] = "" #later
        item['currency'] = response.selector.xpath(
            '//*[@class="currency-dropdown"]/span/text()').extract()[0]
        if (item['currency'] == 'USD'):
            item['currency_symbol'] = '$'
        else:
            item['currency_symbol'] = '?'
        #If item is on sale,
        try:
            # [4:] strips a currency prefix (e.g. "USD ") from the price text.
            item['price_sale'] = int(
                float(
                    response.selector.xpath(
                        "//span[@class='sale-price']/text()").extract()[0]
                    [4:].replace(",", "")))
            item['price_orig'] = int(
                float(
                    response.selector.xpath(
                        "//span[@class='regular-price']/text()").extract()
                    [0][4:].replace(",", "")))
            item['price_perc_discount'] = int(
                (1 - float(item['price_sale']) / float(item['price_orig'])) *
                100)
            item['price'] = item['price_sale']
            item['on_sale'] = True
        except:
            item['price_orig'] = int(
                float(
                    response.selector.xpath(
                        "//span[@class='regular-price']/text()").extract()
                    [0][4:].replace(",", "")))
            item['price'] = item['price_orig']
            item['price_sale'] = item['price_orig']
            item['price_perc_discount'] = 0
            item['on_sale'] = False
        item['image_urls'] = response.selector.xpath(
            './/ul[@class="slides"]/li/img/@src').extract()
        #response.selector.xpath('//*[@class="zoom masterTooltip"]/img/@src').extract()
        #new
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""
        for i in range(0, 6):
            attr = 'imglink_' + str(i + 1)
            try:
                item[attr] = item['image_urls'][i]
            except:
                item[attr] = ""
        mcats = response.selector.xpath(
            './/ol[contains(@class, "breadcrumb") and contains(@class, "hidden-xs")]/li/a/text()'
        ).extract()
        mcats = [mc.strip() for mc in mcats]
        item['mcat_code'] = ""
        item['image_urls'] = ""
        for i in range(0, 5):
            attr = 'mcat_' + str(i + 1)
            try:
                # Last crumb blanked; top-level gender taken from the URL.
                if i == len(mcats) - 1:
                    item[attr] = ""
                elif i == 0:
                    if 'women' in response.url:
                        item[attr] = 'Women'
                    else:
                        item[attr] = 'Men'
                else:
                    item[attr] = mcats[i]
            except:
                item[attr] = ""
        item['cat_code'] = ""
        item['cat_1'] = "" #deprecate
        item['cat_2'] = "" #deprecate
        item['cat_3'] = "" #deprecate
        # Mixed strings and a list; numpy.hstack flattens them into tags.
        t = [
            item['brand'], item['short_desc'], item['mcat_1'], mcats[1:],
            item['long_desc']
        ]
        item['tags'] = " ".join(list(numpy.hstack(t)))
        item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        yield item
def parse(self, response):
    # Scrape a Belk product page into a NuyolkItem.
    datetime = int(str(int(time.time() * 100)))
    random.seed(1412112 + datetime)
    item = NuyolkItem()
    item['is_available'] = True
    item['affiliate_partner'] = "viglink"
    item['prod_id'] = str(
        str(datetime) + str(int(random.uniform(100000, 999999))))
    item['product_link'] = response.url
    item['merchant'] = "Belk"
    # SKU is the last URL path segment, minus the ".html" extension.
    item['merchant_prod_id'] = response.url.split("/")[-1].replace(
        ".html", "")
    #item['upc'] ##TODO
    item['merchant_id'] = "IXR49N"
    try:
        item['brand'] = response.selector.xpath(
            '//*[@itemprop="brand"]/text()').extract()[0]
    except:
        item['brand'] = ""
    item['short_desc'] = response.selector.xpath(
        '//*[@class="brand-name"]/text()').extract()[0].strip()
    # Meta description + copy bullet points, minus sizing/care lines.
    ld = response.selector.xpath(
        '//meta[@name="description"]/@content').extract()
    ld.extend(
        response.selector.xpath(
            '//ul[@class="copyline"]/li/text()').extract())
    skipwords = [
        "clean", "instructions", "cm", "wash", "in.", "inch", "size",
        "mm ", "size"
    ]
    for w in skipwords:
        ld = list(np.array(ld)[np.array([w not in x for x in ld])])
    item['long_desc'] = " | ".join(ld).strip()
    item['primary_color'] = "" #later
    item['currency'] = response.selector.xpath(
        '//meta[@itemprop="priceCurrency"]/@content').extract()[0]
    if (item['currency'] == 'USD'):
        item['currency_symbol'] = '$'
    else:
        item['currency_symbol'] = '?'
    ##TODO
    #If item is on sale,
    #[4:].replace(",", "")
    try:
        # Sale case: both a sale price and an "Orig. $..." standard price.
        item['price_sale'] = int(
            float(
                response.selector.xpath(
                    "//*[@class='price-sales']/span/text()").extract()
                [0].replace(",", "")))
        item['price_orig'] = int(
            float(
                response.selector.xpath(
                    "//*[@class='price-standard']/text()").extract()
                [0].replace("Orig. $", "").replace(",", "")))
        item['price_perc_discount'] = int(
            (1 - float(item['price_sale']) / float(item['price_orig'])) *
            100)
        item['price'] = item['price_sale']
        item['on_sale'] = True
    except:
        # Regular price lives in one of two alternative nodes.
        try:
            item['price_orig'] = int(
                float(
                    response.selector.xpath(
                        "//*[@class='standardprice']/input/@value").
                    extract()[0].replace(",", "")))
        except:
            try:
                item['price_orig'] = int(
                    float(
                        response.selector.xpath(
                            "//*[@class='standardprice']/span/text()").
                        extract()[0].replace(",", "")))
            except:
                print("??? SKIPPED!")
                return
        item['price'] = item['price_orig']
        item['price_sale'] = item['price_orig']
        item['price_perc_discount'] = 0
        item['on_sale'] = False
    item['image_urls'] = response.selector.xpath(
        '//div[@class="product-thumbnails"]//li/a/@href').extract()
    #response.selector.xpath('//*[@class="zoom masterTooltip"]/img/@src').extract()
    #new
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    for i in range(0, 6):
        attr = 'imglink_' + str(i + 1)
        try:
            item[attr] = item['image_urls'][i]
        except:
            item[attr] = ""
    # Categories parsed out of the utag_data analytics script.
    mcats = response.xpath(
        '//script[contains(., "var utag_data")]/text()').re(
            'product_category\"\: \[([^]]+)\]')[0].strip().replace(
                '"', "")
    mcats = mcats.split(" > ")
    item['mcat_code'] = ""
    item['image_urls'] = ""
    for i in range(0, 5):
        attr = 'mcat_' + str(i + 1)
        try:
            # The final category level is blanked on purpose.
            if i == len(mcats) - 1:
                item[attr] = ""
            else:
                item[attr] = mcats[i]
        except:
            item[attr] = ""
    item['cat_code'] = ""
    item['cat_1'] = "" #deprecate
    item['cat_2'] = "" #deprecate
    item['cat_3'] = "" #deprecate
    # Mixed strings and a list; numpy.hstack flattens them into tags.
    t = [
        item['brand'], item['short_desc'], item['mcat_1'], mcats[1:],
        item['long_desc']
    ]
    item['tags'] = " ".join(list(numpy.hstack(t)))
    item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    yield item
def parse(self, response):
    """Scrape a Shoptiques product page into a NuyolkItem.

    Fixes vs. the previous version:
    - price_perc_discount used Python 2 integer division (sale / orig),
      which truncated the ratio to 0 and reported a 100% discount for
      every sale item; float division is used now.
    - the not-on-sale branch now also sets price_sale and
      price_perc_discount, matching the sibling spiders in this file.
    """
    datetime = int(str(int(time.time() * 100))) #Don't change!
    random.seed(1412112 + datetime) #Don't change!
    item = NuyolkItem() #Don't change!
    item['brand'] = response.selector.xpath(
        '//span[@itemprop="brand"]/a/text()').extract()[0]
    item['cat_code'] = ""
    item['cat_1'] = "" #deprecate
    item['cat_2'] = "" #deprecate
    item['cat_3'] = "" #deprecate
    item['currency'] = str(
        response.selector.xpath(
            '//div[@class="currency"]/span[@class="code"]/text()').extract(
            )[0])
    if item['currency'] == 'USD':
        item['currency_symbol'] = '$'
    else:
        item['currency_symbol'] = '?'
    item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    #item['image_urls'] = response.selector.xpath('//ul[@id="image-carousel"]/li/a/@href').extract()
    item['image_urls'] = ""
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    try:
        item['long_desc'] = response.selector.xpath(
            '//div[@itemprop="description"]//text()').extract()[0]
    except:
        item['long_desc'] = ""
    # Breadcrumb categories; missing levels become "".
    mcats = response.selector.xpath(
        './/ul[@class="shoptiques-breadcrumb"]/li/a/text()').extract()
    for i in range(0, 5):
        attr = 'mcat_' + str(i + 1)
        try:
            item[attr] = mcats[i]
        except:
            item[attr] = ""
    item['mcat_code'] = "" #later #Do NLP predictions
    item['merchant_id'] = "3O056R"
    item['merchant_prod_id'] = ''
    try:
        # Retail vs. sale price; [1:] strips the currency symbol.
        orig = int(
            float(
                response.selector.xpath(
                    '//*[@id="product-detail"]//*[contains(@class, "retail")]/text()'
                ).extract()[0][1:]))
        sale = int(
            float(
                response.selector.xpath(
                    '//*[@id="product-detail"]//*[contains(@class, "sale")]/text()'
                ).extract()[0][1:]))
        if (orig != sale):
            item['price_orig'] = int(orig)
            item['price_sale'] = int(sale)
            # FIX: float division -- the old int division gave 0 or 100.
            item['price_perc_discount'] = int(100 - 100.0 * sale / orig)
            item['on_sale'] = True
            item['price'] = int(item['price_sale'])
        else:
            item['price_orig'] = orig
            item['price'] = orig
            # FIX: also populate the sale fields for consistency with the
            # other spiders' not-on-sale branches.
            item['price_sale'] = orig
            item['price_perc_discount'] = 0
            item['on_sale'] = False
    except:
        # No retail/sale pair: fall back to the plain product price node(s).
        try:
            item['price_orig'] = int(
                float(
                    response.selector.xpath(
                        '//div[@class="product-name"]/span[@id="product-price"]/span/text()'
                    ).extract()[0][1:]))
        except Exception as e:
            item['price_orig'] = int(
                float(
                    response.selector.xpath(
                        '//div[@class="product-name"]/span[@id="product-price"]/span/span[1]/text()'
                    ).extract()[0][1:]))
        item['price'] = item['price_orig']
        item['price_sale'] = item['price_orig']
        item['on_sale'] = False
        item['price_perc_discount'] = 0
    item['primary_color'] = "" #later
    item['prod_id'] = str(datetime) + str(
        int(random.uniform(100000, 999999))) #Don't change!
    item['product_link'] = str(
        response.selector.xpath(
            '//head/link[@rel="canonical"]/@href').extract()[0])
    # NOTE(review): the trailing .replace(" ", "") strips spaces from the
    # short description -- possibly meant for a non-breaking space; kept
    # as-is to preserve behavior.  Confirm against downstream consumers.
    item['short_desc'] = str(
        response.selector.xpath(
            '//div[@id="product-detail"]/div[@class="product-name"]/h1/text()'
        ).extract()[0].strip()).strip().replace(" ", "")
    # Mixed strings and a list; numpy.hstack flattens them into tags.
    t = [
        item['brand'], item['short_desc'], "Women", mcats,
        item['long_desc']
    ]
    item['tags'] = " ".join(list(numpy.hstack(t)))
    item['imglinks'] = response.selector.xpath(
        '//ul[@id="image-carousel"]/li/a/@href').extract()
    for i in range(0, 6):
        attr = 'imglink_' + str(i + 1)
        if i < len(item['imglinks']):
            item[attr] = str(item['imglinks'][i])
        else:
            item[attr] = ""
    item['imglinks'] = ""
    item['is_available'] = True #Don't change! #Fix later!
    item['affiliate_partner'] = "viglink"
    item['merchant'] = "Shoptiques"
    yield item
def parse(self, response):
    """Scrape a product page (merchant JOQ3F3) into a NuyolkItem.

    Fix vs. the previous version: item['mcat_1'] was assigned six times
    in a row (copy-paste slip), leaving mcat_2..mcat_5 unset; each field
    is now initialised to "" once.
    """
    datetime = int(str(int(time.time() * 100)))
    random.seed(1412112 + datetime) #Don't change!
    item = NuyolkItem() #Don't change!
    try:
        # Probe: raises IndexError on non-product pages so the outer
        # except drops the page early.
        test = str(
            response.selector.xpath(
                '//div[@id="product-old"]/form/input[@name="product"]/@value'
            ).extract()[0])
        item['brand'] = "" #Needs post-processing!
        item['cat_code'] = ""
        item['cat_1'] = "" #deprecate
        item['cat_2'] = "" #deprecate
        item['cat_3'] = "" #deprecate
        # Currency code sits in the LAST <li> of the currency widget; the
        # raw "<span>XXX</span>" markup is sliced between '>' and '<'.
        curr = '//*[@id="currency-widget"]/li[' + str(
            len(
                response.selector.xpath('//*[@id="currency-widget"]/li').
                extract())) + ']/a/span'
        curr_temp = response.selector.xpath(curr).extract()[0]
        item['currency'] = str(curr_temp[curr_temp.index(">") +
                                         1:curr_temp.rindex("<")])
        item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        #item['image_urls'] = response.selector.xpath('//ul[@class="thumbs"]/li/a/img/@src').extract()
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""
        long_desc = response.selector.xpath(
            '//p[@class="description"]/text()').extract()
        #item['long_desc'] = " ".join(long_desc).replace('\n', '').replace('\t', '').replace('\r', '').replace(u"\u2022", "")
        item['long_desc'] = " ".join(long_desc)
        item['mcats'] = "" #later #Do NLP predictions
        # FIX: was six assignments to 'mcat_1'; initialise each level once.
        item['mcat_1'] = ""
        item['mcat_2'] = ""
        item['mcat_3'] = ""
        item['mcat_4'] = ""
        item['mcat_5'] = ""
        item['mcat_code'] = ""
        item['merchant_id'] = "JOQ3F3"
        item['merchant_prod_id'] = str(
            response.selector.xpath(
                '//div[@id="product-old"]/form/input[@name="product"]/@value'
            ).extract()[0])
        try:
            # <p> holds the regular price, <h3> the current price; if they
            # differ the item is on sale.  [1:] strips the currency symbol.
            if (response.selector.xpath(
                    "//div[@class='price-mobile']/p/span[@class='price']/text()"
            ).extract()[0] != response.selector.xpath(
                    "//div[@class='price-mobile']/h3/span[@class='price']/text()"
            ).extract()[0]):
                item['price_orig'] = int(
                    float(
                        response.selector.xpath(
                            "//div[@class='price-mobile']/p/span[@class='price']/text()"
                        ).extract()[0][1:]))
                item['price_sale'] = int(
                    float(
                        response.selector.xpath(
                            "//div[@class='price-mobile']/h3/span[@class='price']/text()"
                        ).extract()[0][1:]))
                item['price_perc_discount'] = int(100 - ((float(
                    response.selector.xpath(
                        "//div[@class='price-mobile']/h3/span[@class='price']/text()"
                    ).extract()[0][1:])) / (float(
                        response.selector.xpath(
                            "//div[@class='price-mobile']/p/span[@class='price']/text()"
                        ).extract()[0][1:]))) * 100)
                item['price'] = item['price_sale']
            else:
                item['price_orig'] = int(
                    float(
                        response.selector.xpath(
                            "//div[@class='price-mobile']/h3/span[@class='price']/text()"
                        ).extract()[0][1:]))
                item['price'] = item['price_orig']
        except IndexError:
            item['price_orig'] = int(
                float(
                    response.selector.xpath(
                        "//div[@class='price-mobile']/h3/span[@class='price']/text()"
                    ).extract()[0][1:]))
            item['price'] = item['price_orig']
        item['primary_color'] = "" #later
        # NOTE(review): stored as int here; sibling spiders use str.
        item['prod_id'] = int(
            str(datetime) +
            str(int(random.uniform(100000, 999999)))) #Don't change!
        item['product_link'] = str(
            response.selector.xpath(
                '//link[@rel="canonical"]/@href').extract()[0])
        item['short_desc'] = str(
            response.selector.xpath('//title/text()').extract()[0])
        tags = [
            str(item['brand']),
            str(item['short_desc']), item['long_desc']
        ] #str(" ".join(item['mcats'])),
        item['tags'] = " ".join(tags)
        item['imglinks'] = response.selector.xpath(
            '//ul[@class="thumbs"]/li/a/img/@src').extract()
        for i in range(0, 6):
            attr = 'imglink_' + str(i + 1)
            if i < len(item['imglinks']):
                item[attr] = str(item['imglinks'][i])
            else:
                item[attr] = ""
        item['is_available'] = True #Don't change! #Fix later!
        item['affiliate_partner'] = "viglink"
        yield item
    except Exception as e:
        return
def parse(self, response):
    """Scrape an AHAlife product page into a NuyolkItem.

    Refactor vs. the previous version: the six byte-identical
    try/except blocks that populated imglink_1..imglink_6 from carousel
    slots li[1]..li[6] are collapsed into one loop with identical
    behavior (IndexError -> "").
    """
    def find_between(s, first, last):
        # Return the substring of s strictly between `first` and `last`,
        # or "" if either marker is missing.
        try:
            start = s.index(first) + len(first)
            end = s.index(last, start)
            return s[start:end]
        except ValueError:
            return ""
    datetime = int(str(int(time.time() * 100))) #Don't change!
    random.seed(1412112 + datetime) #Don't change!
    item = NuyolkItem() #Don't change!
    item['prod_id'] = str(datetime) + str(
        int(random.uniform(100000, 999999))) #Don't change!
    item['affiliate_partner'] = "viglink"
    item['brand'] = response.selector.xpath(
        '//a[@id="product-brand"]/text()').extract()[0]
    # Long description = details span + detail paragraphs, minus any line
    # containing a sizing/care keyword.
    ld = [
        response.selector.xpath(
            '//div[@id="details"]//span/text()').extract()[0]
    ]
    ld.extend(
        response.selector.xpath(
            './/div[@id="productDetail-details"]//p/text()').extract())
    skipwords = [
        "clean", "instructions", "cm", "wash", "in.", "inch", "size",
        "mm ", "size"
    ]
    for w in skipwords:
        ld = list(np.array(ld)[np.array([w not in x for x in ld])])
    item['long_desc'] = " | ".join(list(numpy.hstack(ld)))
    item['short_desc'] = response.selector.xpath(
        '//h1[@class="heading1"]/text()').extract()[0]
    item['product_link'] = response.selector.xpath(
        '//head/link[@rel="canonical"]/@href').extract()[0]
    item['cat_1'] = ""
    item['cat_2'] = ""
    item['cat_3'] = ""
    item['cat_code'] = ""
    item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['image_urls'] = ""
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    # One imglink per carousel slot; a missing slot becomes "".
    for i in range(1, 7):
        attr = 'imglink_' + str(i)
        try:
            item[attr] = response.selector.xpath(
                '//ul[@id="carousel"]/li[' + str(i) +
                ']/a/@href').extract()[0]
        except IndexError:
            item[attr] = ""
    mcats = response.selector.xpath(
        './/ul[@itemprop="category"]//li//a/text()').extract()
    for i in range(0, 5):
        attr = 'mcat_' + str(i + 1)
        try:
            item[attr] = mcats[i]
        except:
            item[attr] = ""
    item['mcat_code'] = ""
    item['merchant'] = "AHAlife"
    item['merchant_id'] = "SN4NSZ"
    # Merchant product id is the URL segment after "/product/".
    item['merchant_prod_id'] = find_between(response.url, "/product/",
                                            "/")
    item['is_available'] = True
    item['currency'] = response.xpath(
        '//meta[@itemprop="priceCurrency"]/@content').extract()[0]
    if (item['currency'] == 'USD'):
        item['currency_symbol'] = '$'
    else:
        item['currency_symbol'] = '?'
    item['price_orig'] = int(
        float(
            response.selector.xpath(
                '//div[@class="product-price sku-price"]/@data-base-price'
            ).extract()[0]))
    item['price'] = item['price_orig']
    item['price_sale'] = item['price_orig']
    item['on_sale'] = False #BOOLEAN
    item['price_perc_discount'] = 0
    item['primary_color'] = ""
    # Mixed strings and a list; numpy.hstack flattens them into tags.
    t = [
        item['brand'], item['short_desc'], item['mcat_1'], mcats[1:],
        item['long_desc']
    ]
    item['tags'] = " ".join(list(numpy.hstack(t)))
    yield item
def parse(self, response):
    """Scrape a Saks Off 5th product page into a NuyolkItem.

    Fixes vs. the previous version:
    - price_perc_discount used Python 2 integer division (sale/orig),
      which truncated the ratio to 0 and reported a 100% discount for
      every sale item; float division is used now.
    - the six duplicated Scene7 image-URL blocks are collapsed into one
      loop over the suffixes "", "A1".."A5" (identical behavior,
      IndexError -> "").
    """
    datetime = int(str(int(time.time()*100))) #Don't change!
    random.seed(1412112 + datetime) #Don't change!
    item = NuyolkItem() #Don't change!
    item['prod_id'] = str(datetime) + str(int(random.uniform(100000, 999999))) #Don't change!
    item['affiliate_partner'] = "viglink"
    item['brand'] = "Saks Off 5th"
    item['long_desc'] = " | ".join(response.selector.xpath('//div[@itemprop="description"]/ul/li/text()').extract())
    item['short_desc'] = response.selector.xpath('//div[@class="pdt-short-desc o5-product-short-decription"]/span/text()').extract()[0]
    item['product_link'] = response.selector.xpath('//head/link[@rel="canonical"]/@href').extract()[0]
    item['cat_1'] = ""
    item['cat_2'] = ""
    item['cat_3'] = ""
    item['cat_code'] = ""
    # Python 2 `unicode`; kept as one-element lists as before.
    item['date_added'] = [unicode(str(time.strftime("%d/%m/%Y %H:%M:%S")), "utf-8")]
    item['date_last_updated'] = [unicode(str(time.strftime("%d/%m/%Y %H:%M:%S")), "utf-8")]
    item['image_urls'] = ""
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    # Gallery links are derived from the master SKU plus Scene7 view
    # suffixes; a missing SKU node blanks every link.
    sku_nodes = response.selector.xpath('//div[@id="js-product-number"]/@data-master-sku').extract()
    suffixes = ["", "A1", "A2", "A3", "A4", "A5"]
    for i in range(0, 6):
        attr = 'imglink_' + str(i + 1)
        try:
            item[attr] = "http://image.s5a.com/is/image/saksoff5th/" + sku_nodes[0] + suffixes[i] + "_247x329.jpg"
        except IndexError:
            item[attr] = ""
    item['mcat_1'] = ""
    item['mcat_2'] = ""
    item['mcat_3'] = ""
    item['mcat_4'] = ""
    item['mcat_5'] = ""
    item['mcat_code'] = ""
    item['merchant'] = "Saks Off 5th"
    item['merchant_id'] = "E78883"
    item['merchant_prod_id'] = response.selector.xpath('//div[@id="js-product-number"]/@data-master-sku').extract()[0]
    # NOTE(review): stored as the string 'True' (siblings use bool).
    item['is_available'] = 'True' #BOOLEAN
    item['currency'] = "USD"
    item['currency_symbol'] = "$"
    try:
        # [1:] strips the leading "$"; a missing sale node raises
        # IndexError and falls through to the not-on-sale branch.
        orig = int(float(response.selector.xpath('//span[@class="o5-price-standard"]/text()').extract()[0][1:]))
        sale = int(float(response.selector.xpath('//span[@class="price-sales o5-price-sales"]/text()').extract()[0][1:]))
        if (orig != sale):
            item['price_orig'] = orig
            item['price_sale'] = sale
            # FIX: float division -- the old int division gave 0 or 100.
            item['price_perc_discount'] = int(100 - 100.0 * sale / orig)
            item['price'] = item['price_sale']
            item['on_sale'] = 'True' #BOOLEAN
        else:
            item['price_orig'] = orig
            item['price'] = item['price_orig']
            item['price_sale'] = ""
            item['on_sale'] = 'False'
    except IndexError:
        item['price_orig'] = int(float(response.selector.xpath('//span[@class="o5-price-standard"]/text()').extract()[0][1:]))
        item['price'] = item['price_orig']
        item['price_sale'] = ""
        item['on_sale'] = 'False' #BOOLEAN
    item['primary_color'] = ""
    tags = [str(item['brand']), str(item['short_desc']), str(item['long_desc'])] #str(" ".join(item['mcats'])),
    item['tags'] = " ".join(tags)
    yield item
def parse(self, response):
    """Parse a Burke Decor product page into a NuyolkItem.

    Review fixes: bare ``except:`` clauses narrowed (they previously also
    swallowed KeyboardInterrupt/SystemExit); the stray ``numpy.hstack`` is
    unified with the ``np`` alias already used in this method; the
    duplicate ``"size"`` skipword is removed.
    """
    time.sleep(0.5)  # crude politeness throttle between requests
    datetime = int(str(int(time.time() * 100)))
    random.seed(1412112 + datetime)
    item = NuyolkItem()
    item['is_available'] = True
    item['affiliate_partner'] = "viglink"
    item['prod_id'] = str(
        str(datetime) + str(int(random.uniform(100000, 999999))))
    item['product_link'] = response.url
    item['merchant'] = "Burke Decor"
    try:
        item['merchant_prod_id'] = response.selector.xpath(
            '//*[@class="product-status"]/text()').extract()[0].replace(
                "SKU: ", "").strip()
    except Exception:
        return  # no SKU on the page: not an indexable product page
    # item['upc'] ##TODO
    item['merchant_id'] = "A82I78"
    try:
        item['brand'] = response.selector.xpath(
            '//*[@class="product_meta"]//a/text()').extract()[0]
    except Exception:
        item['brand'] = ""
    item['short_desc'] = response.selector.xpath(
        '//*[@itemprop="name"]/@content').extract()[0]
    try:
        ld = [response.selector.xpath(
            '//p[@itemprop="description"]/following::p/text()').extract()[0]]
        if ld == [u'\xa0']:  # description is a lone &nbsp;
            ld = []
        ld2 = response.selector.xpath(
            '//p[@itemprop="description"]/following::ul[1]//text()').extract()
        ld2 = filter(lambda x: "%" in x or "Finish" in x, ld2)
        ld.extend(ld2)
        # Drop care/sizing boilerplate lines from the description.
        skipwords = ["clean", "instructions", "cm", "wash", "in.", "inch",
                     "size", "mm ", "Weight", "Dimensions"]
        for w in skipwords:
            ld = list(np.array(ld)[np.array([w not in x for x in ld])])
        item['long_desc'] = " | ".join(ld).strip()
    except Exception:
        return  ##OOS
    item['primary_color'] = ""  # later
    item['currency'] = response.selector.xpath(
        '//meta[@itemprop="priceCurrency"]/@content').extract()[0]
    if (item['currency'] == 'USD'):
        item['currency_symbol'] = '$'
    else:
        item['currency_symbol'] = '?'  ##TODO
    # If the item is on sale, ComparePrice exists; otherwise the lookup
    # raises and the fallback treats ProductPrice as the regular price.
    try:
        item['price_sale'] = int(float(response.selector.xpath(
            '//*[@id="ProductPrice"]/text()').extract()[0].strip()[1:]))
        item['price_orig'] = int(float(response.selector.xpath(
            '//*[@id="ComparePrice"]/text()').extract()[0].strip()[1:].replace(
                ",", "")))
        item['price_perc_discount'] = int(
            (1 - float(item['price_sale']) / float(item['price_orig'])) * 100)
        item['price'] = item['price_sale']
        item['on_sale'] = True
    except Exception:
        item['price_orig'] = int(float(response.selector.xpath(
            '//*[@id="ProductPrice"]/text()').extract()[0].strip()[1:].replace(
                ",", "")))
        item['price'] = item['price_orig']
        item['price_sale'] = item['price_orig']
        item['price_perc_discount'] = 0
        item['on_sale'] = False
    item['image_urls'] = response.selector.xpath(
        '//*[@class="product-media"]//img//@src').extract()
    # Protocol-relative URLs; strip the cache-busting ?v= query.
    item['image_urls'] = [
        'http:' + x.split('?v=', 1)[0] for x in item['image_urls']]
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    for i in range(0, 6):
        attr = 'imglink_' + str(i + 1)
        try:
            item[attr] = item['image_urls'][i]
        except IndexError:
            item[attr] = ""
    # Breadcrumb trail scraped from the Facebook-pixel script blob.
    mcats = response.xpath('//script[contains(., "fbq(")]/text()').re(
        'content_category\: \'([^]]+)')
    mcats = mcats[0].split(",")[0]
    mcats = mcats.split(" > ")
    mcats = filter(
        lambda x: "All" not in x and "New" not in x and "$" not in x and
        item['brand'] not in x and "Sale" not in x and "Shop" not in x,
        mcats)
    item['mcat_code'] = ""
    item['image_urls'] = ""
    for i in range(0, 5):
        attr = 'mcat_' + str(i + 1)
        try:
            # NOTE(review): the last breadcrumb entry is deliberately
            # blanked (mirrors the REVOLVE spider) — presumably the product
            # itself, not a category; confirm intent.
            if i == len(mcats) - 1:
                item[attr] = ""
            else:
                item[attr] = mcats[i]
        except IndexError:
            item[attr] = ""
    item['cat_code'] = ""
    item['cat_1'] = ""  # deprecate
    item['cat_2'] = ""  # deprecate
    item['cat_3'] = ""  # deprecate
    t = [item['brand'], item['short_desc'], item['mcat_1'], mcats[1:],
         item['long_desc']]
    item['tags'] = " ".join(list(np.hstack(t)))
    item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    yield item
def parse(self, response):
    """Parse an ASOS US product page into a NuyolkItem.

    Fix(review): the discount used Python 2 integer division —
    ``(item['price_sale'] / item['price_orig'])`` is 0 or 1 for ints, so
    price_perc_discount was always 100 or 0; now uses float division.
    The ``price_orig = int(0)`` fallback path then divided by zero; it is
    guarded.  The five copy-pasted imglink try/excepts (2-6) are collapsed
    into one loop; imglink_1's distinct xpath and missing XXL rewrite are
    preserved as-is (TODO confirm whether that asymmetry is intended).
    """

    def find_between(s, first, last):
        # Substring of s strictly between the first `first` and the next
        # `last`; "" when either marker is absent.
        try:
            start = s.index(first) + len(first)
            end = s.index(last, start)
            return s[start:end]
        except ValueError:
            return ""

    datetime = int(str(int(time.time() * 100)))  # Don't change!
    random.seed(1412112 + datetime)  # Don't change!
    item = NuyolkItem()  # Don't change!
    item['prod_id'] = str(datetime) + str(
        int(random.uniform(100000, 999999)))  # Don't change!
    item['affiliate_partner'] = "viglink"
    # Brand is the first segment of the <title>, e.g. "Nike | ...".
    item['brand'] = response.xpath('//title/text()').extract_first().split(
        ' | ')[0]
    descs = response.selector.xpath(
        '//div[@class="product-description"]/span//text()').extract()
    descs = list(filter(lambda a: a != ' ', descs))
    # Drop care/sizing boilerplate lines from the description.
    skipwords = ["clean", "instructions", "cm", "wash", "in.", "inch",
                 "size", "mm "]
    for w in skipwords:
        descs = list(np.array(descs)[np.array([w not in x for x in descs])])
    item['long_desc'] = "".join(descs[0:3]) + " | " + " | ".join(
        descs[3:len(descs)])
    item['short_desc'] = response.selector.xpath(
        '//div[@class="product-hero"]//h1/text()').extract()[0]
    item['product_link'] = response.selector.xpath(
        '//head/link[@rel="canonical"]/@href').extract()[0]
    item['cat_1'] = ""
    item['cat_2'] = ""
    item['cat_3'] = ""
    item['cat_code'] = ""
    item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['image_urls'] = ""
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    try:
        item['imglink_1'] = response.selector.xpath(
            '//div[@class="product-gallery"]//ul/li[1]/img/@src').extract()[0]
    except IndexError:
        item['imglink_1'] = ""
    # Gallery thumbs 2-6: swap the 40px thumbnail params for the 513px zoom.
    for i in range(2, 7):
        attr = 'imglink_' + str(i)
        try:
            link = response.selector.xpath(
                '//div[@class="product-gallery"]//ul/li[%d]//img/@src'
                % i).extract()[0]
            item[attr] = link.replace("S$&wid=40", "XXL$&wid=513")
        except IndexError:
            item[attr] = ""
    mcats = response.selector.xpath(
        '//*[@id="more-from"]/descendant::a/text()').extract()
    for i in range(0, 5):
        attr = 'mcat_' + str(i + 1)
        try:
            item[attr] = mcats[i]
        except IndexError:
            item[attr] = ""
    item['mcat_code'] = ""
    item['merchant'] = "ASOS US"
    item['merchant_id'] = "IU95X3"
    item['merchant_prod_id'] = str(response.selector.xpath(
        '//*[@class="product-code"]//span/text()').extract()[0])
    item['is_available'] = True  # BOOLEAN
    # Prices and currency live in an inline JSON blob in a <script> tag.
    p = "\n".join(response.selector.xpath(
        '//script[contains(., "current")]/text()').extract())
    item['currency'] = find_between(p, '"currency":"', '",')[0:3]
    if (item['currency'] == 'USD'):
        item['currency_symbol'] = '$'
    else:
        item['currency_symbol'] = '?'
    item['price'] = int(float(find_between(p, '"current":', ",")))
    prev = float(find_between(p, '"previous":', ","))
    rrp = float(find_between(p, '"rrp":', ","))
    if (prev == 0 and rrp == 0):
        # Neither a previous nor an RRP price: not on sale.
        item['price_orig'] = item['price']
        item['price_sale'] = item['price']
        item['price_perc_discount'] = 0
        item['on_sale'] = False
    else:
        item['price_sale'] = item['price']
        if (prev > 0):
            item['price_orig'] = int(prev)
        elif (rrp > 0):
            item['price_orig'] = int(rrp)
        else:
            item['price_orig'] = int(0)  ###TODO ???
        item['on_sale'] = True
        if item['price_orig'] > 0:
            # Float division: integer division made this 100 or 0.
            item['price_perc_discount'] = int(
                100 - 100 * (float(item['price_sale']) / item['price_orig']))
        else:
            # Guard the price_orig == 0 fallback (previously ZeroDivisionError).
            item['price_perc_discount'] = 0
    item['primary_color'] = ""
    tags = [str(item['brand']), str(item['short_desc']),
            str(item['long_desc'])]  # str(" ".join(item['mcats'])),
    item['tags'] = " ".join(tags)
    yield item
def parse(self, response):
    """Parse a REVOLVE product page into a NuyolkItem.

    Review fixes: bare ``except:`` clauses narrowed to named exceptions,
    and the six copy-pasted slideshow-image try/excepts collapsed into one
    loop.  The price math already used float division and is unchanged.
    """
    datetime = int(str(int(time.time() * 100)))  # Don't change!
    random.seed(1412112 + datetime)  # Don't change!
    item = NuyolkItem()  # Don't change!
    item['prod_id'] = str(datetime) + str(
        int(random.uniform(100000, 999999)))  # Don't change!
    item['affiliate_partner'] = "viglink"
    try:
        item['brand'] = response.selector.xpath(
            '//meta[@name="twitter:data2"]/@content').extract()[0]
    except Exception:
        return  # no brand meta tag: not a product page
    ld = response.selector.xpath(
        '//div[@class="product-details__content js-tabs__content js-tabs__content-active product-details__description"]/ul/li/text()'
    ).extract()
    if (len(ld) >= 7):
        ld = ld[:7]
    # Drop the style-number and metric-sizing bullet points.
    ld = filter(lambda x: "Style No." not in x and " cm" not in x, ld)
    item['long_desc'] = " | ".join(ld)
    item['short_desc'] = response.selector.xpath(
        '//*[@class="product-titles"]//h1/text()').extract()[0].strip()
    item['product_link'] = response.selector.xpath(
        '//head/link[@rel="canonical"]/@href').extract()[0]
    item['cat_1'] = ""
    item['cat_2'] = ""
    item['cat_3'] = ""
    item['cat_code'] = ""
    item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
    item['image_urls'] = ""
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    # Slideshow pager anchors carry full-size image URLs in @data-image.
    for i in range(1, 7):
        attr = 'imglink_' + str(i)
        try:
            item[attr] = response.selector.xpath(
                '//div[@id="js-primary-slideshow__pager"]/a[%d]/@data-image'
                % i).extract()[0]
        except IndexError:
            item[attr] = ""
    mcats = response.selector.xpath(
        '//*[@class="pdp_lower_area"]/div[5]//li//text()').extract()[1:]
    mcats = [x.strip() for x in mcats]
    mcats = filter(
        lambda x: x != "" and x != item['brand'] and "REVOLVE" not in x,
        mcats)
    for i in range(0, 5):
        attr = 'mcat_' + str(i + 1)
        try:
            # NOTE(review): the last breadcrumb entry is deliberately
            # blanked — presumably the product itself; confirm intent.
            if i == len(mcats) - 1:
                item[attr] = ""
            else:
                item[attr] = mcats[i]
        except IndexError:
            item[attr] = ""
    item['mcat_code'] = ""
    item['merchant'] = "REVOLVE"
    item['merchant_id'] = "35KQ17"
    item['merchant_prod_id'] = response.selector.xpath(
        '//input[@id="productCode"]/@value').extract()[0]
    item['is_available'] = True
    item['currency'] = response.selector.xpath(
        '//meta[@property="wanelo:product:price:currency"]/@content'
    ).extract()[0]
    if (item['currency'] == 'USD'):
        item['currency_symbol'] = '$'
    else:
        item['currency_symbol'] = '?'
    ##TODO
    # A strikethrough retail price node means the item is on sale; when it
    # is absent the xpath lookup raises IndexError and the fallback reads
    # the itemprop price instead.
    try:
        sale = int(float(response.selector.xpath(
            '//div[@class="prices__retail--strikethrough"]/preceding::div/text()'
        ).extract()[-1][2:].replace(',', '')))
        orig = int(float(response.selector.xpath(
            '//div[@class="prices__retail--strikethrough"]//text()'
        ).extract()[0][2:].replace(',', '')))
        if (orig != sale):
            item['price_orig'] = orig
            item['price_sale'] = sale
            item['price_perc_discount'] = int(
                100 - 100 * (float(sale) / float(orig)))
            item['price'] = item['price_sale']
            item['on_sale'] = True
        else:
            item['price_orig'] = orig
            item['price'] = item['price_orig']
            item['price_sale'] = item['price_orig']
            item['price_perc_discount'] = 0
            item['on_sale'] = False
    except IndexError:
        try:
            item['price_orig'] = int(float(response.selector.xpath(
                '//meta[@itemprop="price"]/@content').extract()[0]))
        except Exception:
            return  # no price at all: drop the item
        item['price'] = item['price_orig']
        item['price_sale'] = item['price_orig']
        item['on_sale'] = False  # BOOLEAN
        item['price_perc_discount'] = 0
    item['primary_color'] = ""
    t = [item['brand'], item['short_desc'], item['mcat_1'], mcats[1:],
         item['long_desc']]
    item['tags'] = " ".join(list(numpy.hstack(t)))
    yield item
def parse(self, response):
    """Parse a Selfridges product page into a NuyolkItem.

    Fix(review): a stray call to the non-existent ``response.selector.path``
    raised AttributeError inside the blanket try/except, silently dropping
    EVERY item this spider scraped — removed.  The ``if images[0] == []``
    fallback could never fire (``extract()[0]`` is a string), so that dead
    branch is removed as well.
    """
    datetime = int(str(int(time.time() * 100)))  # Don't change!
    random.seed(1412112 + datetime)  # Don't change!
    item = NuyolkItem()  # Don't change!
    try:
        item['brand'] = response.selector.xpath(
            '//span[@class="brand"]/a/text()').extract()[0]
        item['cat_code'] = ""
        item['cat_1'] = ""  # deprecate
        item['cat_2'] = ""  # deprecate
        item['cat_3'] = ""  # deprecate
        item['currency'] = str(response.selector.xpath(
            '//*[@class="translateFlag"]/a/span/text()').extract()[0])
        item['date_added'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        images = []
        images.append(response.selector.xpath(
            '//div[@class="productImage"]//img/@data-rvsrc').extract()[0])
        item['img_1'] = ""
        item['img_2'] = ""
        item['img_3'] = ""
        item['img_4'] = ""
        item['img_5'] = ""
        long_desc = response.selector.xpath(
            '//div[@class="selfridgesSaysInner"]/div/p[@class="hiddenDescription"]/text()'
        ).extract()[0].strip()  ##encoding problem
        item['long_desc'] = long_desc.replace("<b>", "").replace("</b>", "")
        item['mcats'] = ""  # later  #Do NLP predictions
        item['mcat_1'] = ""  # later  #Do NLP predictions
        item['mcat_2'] = ""  # later  #Do NLP predictions
        item['mcat_3'] = ""  # later  #Do NLP predictions
        item['mcat_4'] = ""  # later  #Do NLP predictions
        item['mcat_5'] = ""  # later  #Do NLP predictions
        item['merchant_id'] = "TO663Y"
        item['merchant_prod_id'] = str(response.selector.xpath(
            '//p[@class="pcode"]/span[@class="val"]/text()'
        ).extract()[0].strip())
        # A wasPrice node means the item is on sale; when it is absent the
        # xpath lookup raises IndexError and the plain price is used.
        try:
            orig = int(float(str(response.selector.xpath(
                '//p[@class="wasPrice"]/text()'
            ).extract()[0]).strip().replace(",", "")[1:]))
            sale = int(float(str(response.selector.xpath(
                '//p[@class="price red"]/span[2]/text()'
            ).extract()[0]).strip().replace(",", "")))
            if (orig != sale):
                item['price_orig'] = orig
                item['price_sale'] = sale
                # NOTE(review): Python 2 integer division — rounds the
                # discount up by at most 1; confirm before porting to py3.
                item['price_perc_discount'] = int(100 - 100 * sale / orig)
                item['price'] = sale
            else:
                item['price_orig'] = orig
                item['price'] = orig
        except IndexError:
            item['price_orig'] = int(float(str(response.selector.xpath(
                '//p[@class="price"]/span[2]/text()'
            ).extract()[0]).strip().replace(",", "")))
            item['price'] = item['price_orig']
        item['primary_color'] = ""  # later
        item['prod_id'] = int(
            str(datetime)
            + str(int(random.uniform(100000, 999999))))  # Don't change!
        item['product_link'] = response.selector.xpath(
            '//link[@rel="canonical"]/@href').extract()[0]
        item['short_desc'] = str(response.selector.xpath(
            '//head/title/text()').extract()[0])
        tags = [str(item['brand']), item['short_desc'], item['long_desc']]
        # str(" ".join(item['mcats'])),
        item['tags'] = " ".join(tags)
        item['date_last_updated'] = str(time.strftime("%d/%m/%Y %H:%M:%S"))
        item['merchant'] = 'Selfridges'
        item['imglinks'] = images
        for i in range(0, 6):
            attr = 'imglink_' + str(i + 1)
            if i < len(item['imglinks']):
                item[attr] = str(item['imglinks'][i])
            else:
                item[attr] = ""
        item['is_available'] = True  # Don't change!  #Fix later!
        item['affiliate_partner'] = "viglink"
        yield item
    except Exception as e:
        # Any scrape failure drops the item silently (legacy behaviour).
        return
def parse(self, response):
    """Parse a Harrods product page into a NuyolkItem.

    Review fixes: the bare ``.encode('utf-8', 'ignore')`` expression
    statements were no-ops (``encode`` returns a new object that was
    discarded) and are removed; the displayed-price xpath is extracted once
    instead of three times; the six copy-pasted imglink try/excepts are
    collapsed into a loop; the large commented-out sale-price block (a
    no-op string literal) is dropped.
    """
    datetime = int(str(int(time.time() * 100)))  # Don't change!
    random.seed(1412112 + datetime)  # Don't change!
    item = NuyolkItem()  # Don't change!
    item['prod_id'] = str(datetime) + str(
        int(random.uniform(100000, 999999)))  # Don't change!
    item['affiliate_partner'] = "viglink"
    item['brand'] = "Harrods"
    try:
        item['long_desc'] = response.selector.xpath(
            '//p[@class="description"]/text()').extract()[0]
    except IndexError:
        item['long_desc'] = ''
    try:
        item['short_desc'] = response.selector.xpath(
            '//span[@class="productname"]/text()').extract()[0].strip()
    except IndexError:
        return  # no product name: not a product page
    item['product_link'] = response.selector.xpath(
        '//head/link[@rel="canonical"]/@href').extract()[0]
    item['cat_1'] = ""
    item['cat_2'] = ""
    item['cat_3'] = ""
    item['cat_code'] = ""
    item['date_added'] = unicode(
        str(time.strftime("%d/%m/%Y %H:%M:%S")), "utf-8")
    item['date_last_updated'] = unicode(
        str(time.strftime("%d/%m/%Y %H:%M:%S")), "utf-8")
    item['image_urls'] = ""
    item['img_1'] = ""
    item['img_2'] = ""
    item['img_3'] = ""
    item['img_4'] = ""
    item['img_5'] = ""
    # Alternate-view gallery: up to six image links.
    for i in range(1, 7):
        attr = 'imglink_' + str(i)
        try:
            item[attr] = response.selector.xpath(
                '//ul[@class="alt_view"]/li[%d]/a/@href' % i).extract()[0]
        except IndexError:
            item[attr] = ""
    item['mcat_1'] = ""
    item['mcat_2'] = ""
    item['mcat_3'] = ""
    item['mcat_4'] = ""
    item['mcat_5'] = ""
    item['mcat_code'] = ""
    item['merchant'] = "Harrods"
    item['merchant_id'] = "2GSE52"
    # [13:] strips a leading label from the product-code text — presumably
    # "Product code:"; TODO confirm prefix length against a live page.
    item['merchant_prod_id'] = response.selector.xpath(
        '//span[@class="product_code"]/text()').extract()[0][13:]
    item['is_available'] = 'True'  # BOOLEAN
    item['currency'] = response.selector.xpath(
        '//span[@class="country-selector_currency"]/text()').extract()[0]
    item['currency_symbol'] = response.selector.xpath(
        '//span[@class="country-selector_currency"]/span[@class="code"]/text()'
    ).extract()[0]
    # Sale prices are not handled for Harrods yet: the single displayed
    # price is used for price, price_orig and price_sale alike.
    price = int(float(response.selector.xpath(
        '//span[@class="prices price"]/span/span/text()').extract()[0][1:]))
    item['price'] = price
    item['price_orig'] = price
    item['price_sale'] = price
    item['price_perc_discount'] = 0
    item['on_sale'] = 'False'
    item['primary_color'] = ""
    tags = [str(item['brand']), str(item['short_desc']),
            str(item['long_desc'])]  # str(" ".join(item['mcats'])),
    item['tags'] = " ".join(tags)
    yield item