def parse(self, response):
    """Parse a product detail page and populate a BaseItem.

    Only the product URL and title are extracted on this site; every
    other exported field is pre-seeded with an empty string so the
    record always carries the full schema. Extraction is best-effort:
    a failed lookup leaves the default in place.
    """
    item = BaseItem()

    # Collectors for the structured attribute sections (not used here).
    speci_list, pack_list, intro_list, details_list = [], [], [], []

    # Default every exported field so downstream consumers always see the key.
    for field in ('productUrl', 'productName', 'productBrand', 'productModel',
                  'productClassification', 'productPrice', 'productImagePath',
                  'productAddres', 'productCompany', 'fileName',
                  'productDetails', 'productPack', 'productIntro',
                  'productSpeci'):
        item[field] = ''

    classification_one = classification_two = classification_three = ''

    try:
        item['productUrl'] = response.url
    except:
        pass

    # Product title lives in the detail header; normalise double quotes to
    # single quotes (for the JSON export) and trim surrounding whitespace.
    try:
        raw_name = response.xpath(
            "//div[@class='productDetail product-detail-repair']/h1/text()"
        ).extract()[0]
        item['productName'] = raw_name.encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
def parse(self, response):
    """Parse a product detail page: seed a BaseItem with defaults, then
    best-effort extract the breadcrumb classification, URL and name.

    NOTE(review): `classification` is built but never stored on the item,
    and this visible portion never yields anything — looks unfinished;
    confirm against the original spider source.
    """
    item = BaseItem()
    # Collectors for the structured attribute sections (unused here).
    details_list = []
    pack_list = []
    intro_list = []
    speci_list = []
    # Default every exported field so the record always has the full schema.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    # Breadcrumb anchors 3-5 hold the three-level category path on this site.
    try:
        classification_one = response.xpath("//div[@class='breadcrumbs']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_two = response.xpath("//div[@class='breadcrumbs']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_three = response.xpath("//div[@class='breadcrumbs']/a[5]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    # '|||' is the project-wide separator between category levels.
    classification = classification_one + '|||' + classification_two + '|||' +classification_three
    try:
        item['productUrl'] = response.url
    except:
        pass
    # Product title; double quotes normalised to single quotes for export.
    try:
        item['productName'] = response.xpath('//div[@class="tl-wrap-g"]/div[@class = "title"]/h3/b/text()').extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
def parse(self, response):
    """Extract the product URL and title from a detail page.

    Every exported field is pre-seeded with an empty string so the item
    always carries the full schema; extraction failures are swallowed
    and leave the defaults untouched.
    """
    item = BaseItem()

    # Attribute collectors (not populated by this spider).
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []

    # Seed the full export schema with empty strings.
    for key in ('productUrl', 'productName', 'productBrand', 'productModel',
                'productClassification', 'productPrice', 'productImagePath',
                'productAddres', 'productCompany', 'fileName',
                'productDetails', 'productPack', 'productIntro',
                'productSpeci'):
        item[key] = ''

    classification_one = classification_two = classification_three = ''

    try:
        item['productUrl'] = response.url
    except:
        pass

    # Title sits inside the product form header; normalise quotes, trim.
    try:
        title = response.xpath(
            "//div[@class='textInfo']/form/div/h1/text()").extract()[0]
        item['productName'] = title.encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
def parse(self, response):
    """Parse one product page into a BaseItem and yield it.

    All fields are pre-seeded with '' so the exported record always
    carries the full schema; every extraction is best-effort and a
    failure leaves the default in place. The price is converted from
    yuan to fen (x100).
    """
    # Fix: the original instantiated BaseItem() twice back to back;
    # a single instantiation is sufficient.
    item = BaseItem()
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    classification_one = ''
    classification_two = ''
    classification_three = ''
    # Breadcrumb levels; note the unusual order and the [13] index —
    # presumably matches this site's crumb layout (TODO confirm).
    try:
        classification_two = response.xpath(
            "//dd[@class='crumb_item'][1]/a/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='crumb']/dl/dd[@class='crumb_item'][2]/a/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_one = response.xpath(
            "//dd[@class='crumb_item'][13]/a/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    # '|||' is the project-wide separator between category levels.
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath(
            "//form[@id='form1']/ul/li[@class='tit']/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # Scan the attribute rows for the one whose label is exactly '品 牌:'
    # (brand); the matching row's text is the brand value.
    list_brand = ''
    for j in range(1, 20):
        try:
            list_brand = response.xpath(
                "//form[@id='form1']/ul/li[%i]/label/text()" %
                j).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            if '品 牌:' == list_brand:
                item['productBrand'] = response.xpath(
                    "//form[@id='form1']/ul/li[%i]/text()" %
                    j).extract()[0].encode('utf-8').replace("\"", "\'").strip()
                break
        except:
            pass
    try:
        item['productModel'] = response.xpath(
            '//div[@class="m m1"]/div/ul/dt/li/text()').extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productClassification'] = classification
    except:
        pass
    try:
        # strip whitespace, convert yuan to fen, drop the RMB sign
        item['productPrice'] = response.xpath(
            "//strong[@class='orange price_tit']/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Python 2: filter() over a str returns a str of the kept chars.
        item['productPrice'] = str(
            float(
                filter(lambda ch: ch in '0123456789.~',
                       item['productPrice'])) * 100)
    except:
        pass
    # image link
    try:
        item['productImagePath'] = response.xpath(
            "//div[@class='bd']/div/div/p/span/img/@src").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productAddres'] = response.xpath(
            "//form[@id='form1']/ul/li[4]/text()").extract()[0]
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    # Items from this spider are exported to '<spider name>.json'.
    names = self.name + '.json'
    try:
        item['fileName'] = names
    except:
        pass
    yield item
def parse(self, response):
    """Parse a product page: fill a BaseItem, mine the key/value attribute
    tables into dict lists, append the free-text details to a per-spider
    .txt file, then yield the item.

    Extraction is best-effort throughout (bare try/except keeps defaults
    on failure). Price is converted from yuan to fen (x100).
    """
    item = BaseItem()
    # Collectors for the structured attribute sections.
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Default every exported field so the record always has the full schema.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    # Breadcrumb spans 3/5/7 hold the three category levels (odd indexes
    # presumably skip the '>' separators — TODO confirm on the live page).
    try:
        classification_one = response.xpath(
            "//div[@class='Navigation']/span[3]/a/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath(
            "//div[@class='Navigation']/span[5]/a/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='Navigation']/span[7]/a/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # '|||' is the project-wide separator between category levels.
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath(
            "//form[@class='goods-action']/h1[@class='goodsname']/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # Scan attribute rows for the label containing '品 牌:' (brand).
    list_brand = ''
    for j in range(1, 20):
        try:
            list_brand = response.xpath(
                "//ul[@class='goodsprops clearfix']/li[%i]/span/text()" %
                j).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            if '品 牌:' in list_brand:
                item['productBrand'] = response.xpath(
                    "//ul[@class='goodsprops clearfix']/li[%i]/a/text()" %
                    j).extract()[0].encode('utf-8').replace("\"", "\'").strip()
                break
        except:
            pass
    try:
        item['productModel'] = response.xpath(
            "//ul[@class='goodsprops clearfix']/li/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productClassification'] = classification
    except:
        pass
    try:
        # strip whitespace, convert yuan to fen, drop the RMB sign
        item['productPrice'] = response.xpath(
            "//ul[@class='goods-price list']/li/span[@class='price1']/text()"
        ).extract()[0].encode('utf-8').replace("¥", "").strip()
    except:
        pass
    try:
        # Python 2: filter() over a str returns a str of the kept chars.
        item['productPrice'] = str(
            float(
                filter(lambda ch: ch in '0123456789.~',
                       item['productPrice'])) * 100)
    except:
        pass
    # image link
    try:
        item['productImagePath'] = response.xpath(
            "//div[@class='goodspic']/div[@class='goods-detail-pic']/a/img/@src"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productAddres'] = response.xpath(
            "//form[@id='form1']/ul/li[4]/text()").extract()[0]
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    # Items from this spider are exported to '<spider name>.json'.
    names = self.name + '.json'
    try:
        item['fileName'] = names
    except:
        pass
    # Raw key/value cells of the spec table (keys and values alternate).
    list_details = response.xpath(
        "//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
    logging.info("-------list_details_len=%i" % len(list_details))
    list_pack = response.xpath(
        "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
    ).extract()
    # NOTE(review): 'PDescriptiion' (double i) differs from the
    # 'PDescription' id queried below for product_intro — verify which
    # spelling the site actually uses.
    intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
    logging.info("-------intr_len=%i" % len(intro))
    speci = response.xpath("//span[@id='techParam']/text()").extract()
    logging.info("-------intr_len=%i" % len(speci))
    # Pair up alternating key/value cells. num_one parity: odd = key cell,
    # even = value cell; 0 is a sentinel meaning "skip the value of a
    # brand/model key we deliberately dropped".
    num_one = 1
    for value_details in list_details:
        value_details = value_details.encode('utf-8').replace(
            ":", "\/").replace("\n", "").replace("\"", "").strip()
        # NOTE(review): num_one % 2 can never equal 2 — this branch is dead.
        if num_one % 2 == 2:
            num_one = 1
            continue
        if num_one % 2 == 1:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            # Brand/model already captured above; drop them from details.
            if '品牌' in value_details:
                num_one = 0
                continue
            if '型号' in value_details:
                num_one = 0
                continue
            data2['attrkey'] = value_details
        else:
            if num_one == 0:
                num_one = 1
                continue
            data2['keyname'] = value_details
            details_list.append(data2)
        num_one += 1
    # Split each intro line on ':' and pair the halves as key/value.
    num_two = 1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro:
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                # Name/brand keys end processing of this line entirely.
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey'] = value_intro
            else:
                data2['keyname'] = value_intro
                intro_list.append(data2)
            num_two += 1
    # Same key/value pairing for the technical-spec section.
    num_three = 1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci:
            if num_three % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey'] = value_speci
            else:
                data2['keyname'] = value_speci
                speci_list.append(data2)
            num_three += 1
    product_intro = response.xpath(
        "//span[@id='PDescription']/text()").extract()
    product_pack = response.xpath(
        "//td[@id='imgDiv']/div[@id='div3']/font/b/text()").extract()
    product_details = response.xpath(
        "//div[@id='goods-intro']/p/text()").extract()
    # Append the free-text details to data/<spider name>.txt.
    # NOTE(review): `file` shadows the builtin and is not closed on error.
    filename = self.name + ".txt"
    file = open("data/" + filename, 'a+')
    file.write("\n" + "productUrl:" + response.url + "\n")
    file.write("productDetails:" + "\n")
    for details in product_details:
        details = details.encode('utf-8').replace("\b", "").replace(
            "<br/>", "").replace("<br>", "").strip()
        file.write(details + "\n")
    file.close()
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response): item = BaseItem() speci_list = [] pack_list = [] intro_list = [] details_list = [] item['productUrl'] = '' item['productName'] = '' item['productBrand'] = '' item['productModel'] = '' item['productClassification'] = '' item['productPrice'] = '' item['productImagePath'] = '' item['productAddres'] = "" item['productCompany'] = '' item['fileName'] = '' item['productDetails'] = "" item['productPack'] = "" item['productIntro'] = "" item['productSpeci'] = "" classification_one = '' classification_two = '' classification_three = '' print "PhantomJS is starting1..." driver = webdriver.PhantomJS() driver.get(response.url) #time.sleep(3) body = driver.page_source #driver.close() HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response) try: item['productUrl'] = response.url except: pass try: item['productName'] = response.xpath( "//div[@id='product_information']/div[@class='product-titles']/h2/text()" ).extract()[0].encode('utf-8').replace("\"", "\'").strip() except: pass try: item['productBrand'] = response.xpath( "//div[@class='product-concerns']/ul/li[@class='item'][3]/span[@class='detail']/i[@class='minor']/em[@class='action-mktprice']/text()" ).extract()[0].encode('utf-8').replace("\"", "\'").strip() except: pass try: item['productModel'] = response.xpath( "//div[@class='product-concerns']/ul/li[@class='item'][4]/span[@class='detail']/i[@class='minor']/em[@class='action-mktprice']/text()" ).extract()[0].encode('utf-8').replace("\"", "\'").strip() except: pass try: classification_one = response.xpath( "//div[@id='p_navbar']/a[2]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() classification_two = response.xpath( "//div[@id='p_navbar']/a[3]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() classification_three = response.xpath( "//div[@id='p_navbar']/a[4]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() item[ 'productClassification'] = classification_one 
+ '|||' + classification_two + '|||' + classification_three except: pass try: item['productPrice'] = HtmlResponses.xpath( "//ins[@class='action-price']/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass try: item['productPrice'] = str( float( filter(lambda ch: ch in '0123456789.~', item['productPrice'])) * 100) except: pass try: ImagePath = response.xpath( '//div[@class="product-album-pic"]/a/@href').extract( )[0].encode('utf-8').replace("\"", "\'").strip() ImagePath = ImagePath.split('?')[-2] item['productImagePath'] = ImagePath except: pass try: item['productAddres'] = response.xpath( "//form[@id='form1']/ul/li[4]/text()").extract()[0] except: pass try: item['productCompany'] = "" except: pass names = self.name + '.json' try: item['fileName'] = names except: pass list_details = response.xpath( "//div[@id='product_detail']/div[@class='product-attributes']/ul[@class='clearfix']/li/text()" ).extract() details = response.xpath( "//ul[@class='inLeft_attributes']/li/span/text()").extract() logging.info("-------list_details_len=%i" % len(list_details)) logging.info("-------details_len=%i" % len(details)) list_pack = response.xpath( "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()" ).extract() intro = response.xpath("//span[@id='PDescription']/text()").extract() logging.info("-------intr_len=%i" % len(intro)) speci = response.xpath("//span[@id='techParam']/text()").extract() logging.info("-------intr_len=%i" % len(speci)) num_one = 1 for value_details in list_details: value_details = value_details.encode('utf-8').replace( "\"", "\'").strip() if '品牌' in value_details: continue else: details = value_details.split(':') data2 = {} data2['attrkey'] = '' data2['keyname'] = '' data2['attrkey'] = details[0] data2['keyname'] = details[1] details_list.append(data2) num_one += 1 num_two = 1 for list_intro in intro: list_intro = 
list_intro.encode('utf-8').replace("\n", "").replace( "\"", "").strip() list_intro = list_intro.split(':') for value_intro in list_intro: if num_two % 2 == 1: data2 = {} data2['attrkey'] = '' data2['keyname'] = '' if '商品名称' in value_intro: break if '品牌' in value_intro: break data2['attrkey'] = value_intro else: data2['keyname'] = value_intro intro_list.append(data2) num_two += 1 num_three = 1 for list_speci in speci: list_speci = list_speci.encode('utf-8').replace("\n", "").replace( "\"", "").strip() list_speci = list_speci.split(':') for value_speci in list_speci: if num_three % 2 == 1: data2 = {} data2['attrkey'] = '' data2['keyname'] = '' if '商品名称' in value_speci: break if '品牌' in value_speci: break data2['attrkey'] = value_speci else: data2['keyname'] = value_speci speci_list.append(data2) num_three += 1 item['productSpeci'] = speci_list item['productPack'] = pack_list item['productIntro'] = intro_list item['productDetails'] = details_list yield item
def parse(self, response):
    """Parse a product page: fill a BaseItem, mine the attribute table
    into key/value dicts, append the free-text details to a per-spider
    .txt file, then yield the item.

    Extraction is best-effort throughout; the price is converted from
    yuan to fen (x100).
    """
    item = BaseItem()
    # Collectors for the structured attribute sections.
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Default every exported field so the record always has the full schema.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    # Breadcrumb anchors 2-4 hold the three category levels.
    try:
        classification_one = response.xpath(
            "//div[@class='layout']/div[@class='path']/a[2]/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath(
            "//div[@class='layout']/div[@class='path']/a[3]/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='layout']/div[@class='path']/a[4]/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # '|||' is the project-wide separator between category levels.
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    item['productUrl'] = response.url
    try:
        item['productName'] = response.xpath(
            "//div[@class='prodetails']/h1[@class='protitle']/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # Walk the attribute rows looking for the 品牌 (brand) and 型号 (model)
    # labels; the value lives in the parallel 'list1 clf' list.
    list_brand = []
    try:
        list_brand = response.xpath(
            "//ul[@class='list2 clf']/li[@class='itm']/span[@class='dt']/text()"
        ).extract()
        # Fix: xpath li[%i] is 1-based, so range(1, len(list_brand))
        # silently skipped the last attribute row; include it.
        for j in range(1, len(list_brand) + 1):
            brand = response.xpath(
                "//ul[@class='list2 clf']/li[@class='itm'][%i]/span[@class='dt']/text()"
                % j).extract()[0].encode('utf-8')
            model = response.xpath(
                "//ul[@class='list2 clf']/li[@class='itm'][%i]/span[@class='dt']/text()"
                % j).extract()[0].encode('utf-8')
            if '品牌' in brand:
                item['productBrand'] = response.xpath(
                    "//ul[@class='list1 clf']/li[@class='itm'][%i]/span[@class='dd']/text()"
                    % j).extract()[0].encode('utf-8').replace(
                        "\"", "\'").strip()
            if '型号' in model:
                item['productModel'] = response.xpath(
                    "//ul[@class='list1 clf']/li[@class='itm'][%i]/span[@class='dd']/text()"
                    % j).extract()[0].encode('utf-8').replace(
                        "\"", "\'").strip()
    except:
        pass
    try:
        item['productClassification'] = classification
    except:
        pass
    # Fix: pre-bind price so a failed xpath no longer leaves it undefined
    # (the NameError was silently swallowed by the bare except below).
    price = ''
    try:
        price = response.xpath("//em[@class='prc']/b/text()").extract(
        )[0].encode('utf-8').replace("\"", "\'").strip()
        if price == '':
            price = '0.0'
    except:
        pass
    try:
        # Python 2: filter() over a str returns a str of the kept chars;
        # convert yuan to fen (x100).
        item['productPrice'] = str(
            float(filter(lambda ch: ch in '0123456789.~', price)) * 100)
    except:
        pass
    try:
        item['productImagePath'] = "http:" + response.xpath(
            "//li[@class='img-itm active']/div[@class='img-box']/img/@src"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    item['productAddres'] = ""
    item['productCompany'] = ""
    # Items from this spider are exported to '<spider name>.json'.
    names = self.name + '.json'
    item['fileName'] = names
    # Raw key/value cells (keys and values alternate).
    list_details = response.xpath(
        "//ul[@class='list1 clf']/li[@class='itm']/span/text()").extract()
    logging.info("-------list_details_len=%i" % len(list_details))
    list_pack = response.xpath(
        "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
    ).extract()
    intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
    logging.info("-------intr_len=%i" % len(intro))
    speci = response.xpath("//span[@id='techParam']/text()").extract()
    logging.info("-------intr_len=%i" % len(speci))
    # Pair alternating key/value cells. num_one parity: odd = key cell,
    # even = value cell; 0 is a sentinel meaning "skip the value of a
    # brand/model key we deliberately dropped".
    num_one = 1
    for value_details in list_details:
        value_details = value_details.encode('utf-8').replace(
            ":", "").replace("\n", "").replace("\"", "").strip()
        # NOTE(review): num_one % 2 can never equal 2 — dead branch,
        # kept to avoid altering the state machine.
        if num_one % 2 == 2:
            num_one = 1
            continue
        if num_one % 2 == 1:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in value_details:
                num_one = 0
                continue
            if '型号' in value_details:
                num_one = 0
                continue
            data2['attrkey'] = value_details
        else:
            if num_one == 0:
                num_one = 1
                continue
            data2['keyname'] = value_details
            details_list.append(data2)
        num_one += 1
    # Split each intro line on ':' and pair the halves as key/value.
    num_two = 1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro:
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey'] = value_intro
            else:
                data2['keyname'] = value_intro
                intro_list.append(data2)
            num_two += 1
    # Same key/value pairing for the technical-spec section.
    num_three = 1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci:
            if num_three % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey'] = value_speci
            else:
                data2['keyname'] = value_speci
                speci_list.append(data2)
            num_three += 1
    product_details = response.xpath(
        "//div[@class='pro-main']/div[@class='con'][1]/text()").extract()
    # Append the free-text details to data/<spider name>.txt.
    filename = self.name + ".txt"
    file = open("data/" + filename, 'a+')
    file.write("\n" + "productUrl:" + response.url + "\n")
    file.write("productDetails:" + "\n")
    for details in product_details:
        details = details.encode('utf-8').replace("\b", "").replace(
            "<br/>", "").replace("<br>", "").strip()
        file.write(details + "\n")
    file.close()
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Parse a product page: fill a BaseItem, mine the intro/spec tables
    into key/value dicts, append the intro text to a per-spider .txt
    file, then yield the item.

    Extraction is best-effort throughout; the price is converted from
    yuan to fen (x100).
    """
    item = BaseItem()
    # Collectors for the structured attribute sections.
    details_list = []
    pack_list = []
    intro_list = []
    speci_list = []
    # Default every exported field so the record always has the full schema.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        item['productUrl'] = response.url
    except:
        pass
    # Fix: the original appended the debug marker "eoriutqirwe" to every
    # product name; removed.
    try:
        item['productName'] = response.xpath("//h1[@class='prodbaseinfo_title']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        item['productBrand'] = response.xpath("//ul[@class='ul_list']/li[@class='fg14'][1]/div[1]/a/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        item['productModel'] = response.xpath("//ul[@class='ul_list']/li[@class='fg14'][2]/p/font/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    # Breadcrumb anchors 2-4 hold the three category levels; '|||' is the
    # project-wide separator.
    try:
        classification_one = response.xpath("//div[@class='location']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_two = response.xpath("//div[@class='location']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_three = response.xpath("//div[@class='location']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        item['productClassification'] = classification_one + '|||' + classification_two + '|||' +classification_three
    except:
        pass
    try:
        item['productPrice'] = response.xpath("//ul[@class='ul_list']/li[@class='fg14'][3]/p/span[@id='attr_price']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        # Python 2: filter() over a str returns a str of the kept chars;
        # convert yuan to fen (x100).
        item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice']))*100)
    except:
        pass
    try:
        item['productImagePath'] = response.xpath('//div[@id="wrap"]/a/@href').extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        item['productAddres'] = ""
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    # Items from this spider are exported to '<spider name>.json'.
    names = self.name+'.json'
    try:
        item['fileName'] = names
    except:
        pass
    list_details = response.xpath("//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
    logging.info("-------list_details_len=%i" %len(list_details))
    list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
    intro = []
    try:
        intro_p = response.xpath("//li[@class='pb10p']/blockquote/table[2]/tbody/tr/td/p/text()").extract()
        logging.info("-------intro_p_len=%i" %len(intro_p))
        intro = response.xpath("//li[@class='pb10pppp']/blockquote/table[2]/tbody/tr/td/text()").extract()
        logging.info("-------intro_len=%i" %len(intro))
    except:
        pass
    speci = []
    try:
        speci_p = response.xpath("//li[@class='pb10p']/blockquote/table[2]/td/p/text()").extract()
        logging.info("-------speci_p_len=%i" %len(speci_p))
        speci = response.xpath("//li[@class='pb10pppp']/blockquote/table[2]/tbody/text()").extract()
        logging.info("-------speci_len=%i" %len(speci))
    except:
        pass
    # Pair alternating key/value cells. num_one parity: odd = key cell,
    # even = value cell; 0 is a sentinel meaning "skip the value of a
    # brand/model key we deliberately dropped".
    num_one=1
    for value_details in list_details :
        value_details = value_details.encode('utf-8').replace(":","\/").replace("\n","").replace("\"","").strip()
        # NOTE(review): num_one % 2 can never equal 2 — dead branch,
        # kept to avoid altering the state machine.
        if num_one%2==2 :
            num_one = 1
            continue
        if num_one%2==1 :
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in value_details:
                num_one=0
                continue
            if '型号' in value_details:
                num_one=0
                continue
            data2['attrkey']=value_details
        else:
            if num_one ==0:
                num_one = 1
                continue
            data2['keyname']=value_details
            details_list.append(data2)
        num_one+=1
    # Intro rows alternate key/value; positions 3/4 are skipped by design
    # (site-specific layout quirk — TODO confirm).
    num_two=1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n","").replace(":","").strip()
        if num_two%2==1:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '型号' in list_intro:
                num_two +=1
                continue
            data2['attrkey']=list_intro
        else:
            if num_two == 3:
                num_two +=1
                continue
            data2['keyname']=list_intro
            if num_two == 4:
                num_two =6
                continue
            intro_list.append(data2)
        num_two +=1
    # Spec rows alternate key/value; name/brand keys end processing.
    num_three=1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n","").replace("\"","").strip()
        if num_three%2==1 :
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '商品名称' in list_speci:
                break
            if '品牌' in list_speci:
                break
            data2['attrkey']=list_speci
        else:
            data2['keyname']=list_speci
            speci_list.append(data2)
        num_three+=1
    # Append the raw intro text to data/<spider name>.txt.
    intro_file = response.xpath("//li[@class='pb10p']/blockquote/table[2]/tbody/tr/td/text()").extract()
    filename = self.name+".txt"
    file = open("data/"+filename, 'a+')
    file.write("\n"+"productUrl:"+response.url+"\n")
    file.write("\n"+"productIntro:"+"\n")
    for list_intro in intro_file:
        list_intro = list_intro.encode('utf-8').replace("\n","").replace(":","").strip()
        file.write(list_intro+"\n")
    file.close()
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Parse a product page: fill a BaseItem, mine the detail/spec lists
    into key/value dicts, then yield the item.

    Extraction is best-effort (bare try/except keeps defaults on
    failure); the price is converted from yuan to fen (x100).
    """
    item = BaseItem()
    # Collectors for the structured attribute sections.
    details_list = []
    pack_list = []
    intro_list = []
    speci_list = []
    # Default every exported field so the record always has the full schema.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath(
            '//div[@class="goods_info"]/div/h1/text()').extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # Brand sits in the 3rd row of the detail list.
    brand = ''
    try:
        brand = response.xpath(
            "//div[@id='con_goods_1']/ul[@class='detail-list']/li[3]/a/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    item['productBrand'] = brand
    try:
        item['productModel'] = response.xpath(
            '//div[@class="m m1"]/div/ul/dt/li/text()').extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # Breadcrumb anchors 2-4 hold the three category levels.
    try:
        classification_one = response.xpath(
            "//div[@class='breadcrumb']/a[2]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath(
            "//div[@class='breadcrumb']/a[3]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='breadcrumb']/a[4]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    # '|||' is the project-wide separator between category levels.
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    item['productClassification'] = classification
    try:
        # strip whitespace, convert yuan to fen, drop the RMB sign
        item['productPrice'] = response.xpath(
            '//strong[@class="p-price"]/font/text()').extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Python 2: filter() over a str returns a str of the kept chars.
        item['productPrice'] = str(
            float(
                filter(lambda ch: ch in '0123456789.~',
                       item['productPrice'])) * 100)
    except:
        pass
    # image link (site-relative href, prefixed with the site root)
    try:
        item[
            'productImagePath'] = "http://www.91yilong.com" + response.xpath(
                '//div[@class="goods_img"]/a/@href').extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productAddres'] = ""
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    # Items from this spider are exported to '<spider name>.json'.
    names = self.name + '.json'
    try:
        item['fileName'] = names
    except:
        pass
    # Raw key/value cells (keys and values alternate).
    list_details = response.xpath(
        "//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
    logging.info("-------list_details_len=%i" % len(list_details))
    list_pack = response.xpath(
        "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
    ).extract()
    intro = response.xpath(
        "//div[@id='con_goods_1']/ul[@class='detail-list']/li/text()"
    ).extract()
    logging.info("-------intr_len=%i" % len(intro))
    speci = response.xpath(
        "//div[@id='con_goods_2']/ul/li/text()").extract()
    logging.info("-------intr_len=%i" % len(speci))
    # Pair alternating key/value cells. num_one parity: odd = key cell,
    # even = value cell; 0 is a sentinel meaning "skip the value of a
    # brand/model key we deliberately dropped".
    num_one = 1
    for value_details in list_details:
        value_details = value_details.encode('utf-8').replace(
            ":", "\/").replace("\n", "").replace("\"", "").strip()
        # NOTE(review): num_one % 2 can never equal 2 — this branch is dead.
        if num_one % 2 == 2:
            num_one = 1
            continue
        if num_one % 2 == 1:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in value_details:
                num_one = 0
                continue
            if '型号' in value_details:
                num_one = 0
                continue
            data2['attrkey'] = value_details
        else:
            if num_one == 0:
                num_one = 1
                continue
            data2['keyname'] = value_details
            details_list.append(data2)
        num_one += 1
    # Split each intro line on ':' and pair the halves as key/value;
    # name/brand keys end processing of that line.
    num_two = 1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro:
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey'] = value_intro
            else:
                data2['keyname'] = value_intro
                intro_list.append(data2)
            num_two += 1
    # Same key/value pairing for the technical-spec section.
    num_three = 1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci:
            if num_three % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey'] = value_speci
            else:
                data2['keyname'] = value_speci
                speci_list.append(data2)
            num_three += 1
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Scrape one JS-rendered product detail page into a BaseItem.

    Renders the page with headless PhantomJS (the site builds its DOM with
    JavaScript), wraps the rendered HTML in a scrapy HtmlResponse, then
    extracts name / brand / model / classification / price / image plus the
    "details", "intro" and "speci" attribute tables.  Every extraction is
    wrapped in a bare try/except so a missing element leaves the field at its
    empty default instead of aborting the whole item.

    NOTE(review): bare ``except: pass`` is deliberate best-effort scraping
    here, but it also hides real bugs; the PhantomJS driver is never closed
    (the ``#driver.close()`` line is commented out), which leaks one browser
    process per request — confirm whether that is intentional.
    """
    item = BaseItem()
    # Accumulators for the {attrkey, keyname} pair dicts built below.
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-seed every field so the exported record always carries all keys.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    print "PhantomJS is starting..."
    driver = webdriver.PhantomJS()
    # driver = webdriver.Chrome()
    driver.get(response.url)
    time.sleep(3)  # crude fixed wait for the page's JavaScript to render
    body = driver.page_source
    #driver.close()
    # Re-wrap the rendered HTML so scrapy xpath selectors work on it.
    HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response)
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = HtmlResponses.xpath("//div[@class='detail-goods-right']/h1[@class='detail-goods-right-head ft18 J_title']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        item['productBrand'] = HtmlResponses.xpath("//div[@class='main-width bread-top-main J_bread']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        # Drop the value when the breadcrumb is the site name ("JD"/京东)
        # rather than an actual brand.
        if '京东' in item['productBrand']:
            item['productBrand'] = ''
    except:
        pass
    try:
        item['productModel'] = HtmlResponses.xpath("//div[@class='detail-goods-right']/div[@class='detail-goods-right-list m-top15 J_goods']/span[2]/label/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        # Breadcrumb levels 1-3, joined with the '|||' separator used by the
        # rest of the pipeline.
        classification_one = response.xpath("//div[@id='ur_here']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_two = response.xpath("//div[@id='ur_here']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_three = response.xpath("//div[@id='ur_here']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        item['productClassification'] = classification_one + '|||' + classification_two + '|||' + classification_three
    except:
        pass
    try:
        # Strip whitespace, convert yuan to fen (x100), drop the currency symbol.
        item['productPrice'] = HtmlResponses.xpath("//div[@class='detail-goods-right']/div[@class='detail-goods-price m-top15']/ul/li[1]/label[@class='ft24 a weight J_salePrice']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        # Python 2 filter() on a str returns a str; keeps digits/'.'/'~' only.
        # A '~' in a price range makes float() raise, leaving the raw string.
        item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice']))*100)
    except:
        pass
    # Image link
    try:
        item['productImagePath'] = HtmlResponses.xpath('//div[@class="detail-goods-left"]/div[1]/img/@src').extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    #print item['image_urls'],"777777"
    try:
        item['productAddres'] = response.xpath("//form[@id='form1']/ul/li[4]/text()").extract()[0]
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    names = self.name+'.json'
    try:
        item['fileName'] = names
    except:
        pass
    # Raw cell lists for the three attribute tables.
    list_details = HtmlResponses.xpath("//div[@class='J_shows']/table/tbody/tr/td/text()").extract()
    logging.info("-------list_details_len=%i" %len(list_details))
    list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
    intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
    logging.info("-------intr_len=%i" %len(intro))
    speci = response.xpath("//span[@id='techParam']/text()").extract()
    logging.info("-------intr_len=%i" %len(speci))
    # --- details table: flat [key, value, key, value, ...] cell list ------
    num_one=1
    for value_details in list_details :
        value_details = value_details.encode('utf-8').replace(":","\/").replace("\n","").replace("\"","").strip()
        # NOTE(review): num_one % 2 can never equal 2 — this branch is dead.
        if num_one%2==2 :
            num_one = 1
            continue
        if num_one%2==1 :
            # Odd position -> attribute key.  Brand/model rows are excluded;
            # num_one=0 marks "skip the matching value on the next pass".
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in value_details:
                num_one=0
                continue
            if '型号' in value_details:
                num_one=0
                continue
            data2['attrkey']=value_details
        else:
            if num_one ==0:
                num_one = 1
                continue
            data2['keyname']=value_details
            details_list.append(data2)
        num_one+=1
    # --- intro table: "key:value" strings split on ':' ---------------------
    num_two=1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n","").replace("\"","").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro :
            if num_two%2==1 :
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                # Product-name / brand rows are dropped entirely.
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey']=value_intro
            else:
                data2['keyname']=value_intro
                intro_list.append(data2)
            num_two+=1
    # --- speci table: same key/value alternation as intro ------------------
    num_three=1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n","").replace("\"","").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci :
            if num_three%2==1 :
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey']=value_speci
            else:
                data2['keyname']=value_speci
                speci_list.append(data2)
            num_three+=1
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Scrape one product detail page (deppre.cn layout) into a BaseItem.

    Plain-HTTP scraping (no browser): extracts name / brand / model /
    breadcrumb classification / price / image / address plus the details,
    intro and speci attribute tables, and additionally appends the product
    URL and intro paragraphs to ``data/<spider-name>.txt``.  Bare
    ``except: pass`` is the deliberate best-effort style used throughout
    this file.
    """
    item = BaseItem()
    # Accumulators for the {attrkey, keyname} pair dicts built below.
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-seed every field so the exported record always carries all keys.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath("//div[@class='goods_content_a_l_r f_l']/div[@class='title']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        item['productBrand'] = response.xpath("//div[@class='goods_content_a_l_r f_l']/div[4]/span[@class='brand']/a/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        item['productModel'] = response.xpath('//div[@class="m m1"]/div/ul/dt/li/text()').extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        # Breadcrumb levels 1-3.
        classification_one = response.xpath("//div[@class='position w1000']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_two = response.xpath("//div[@class='position w1000']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_three = response.xpath("//div[@class='position w1000']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    item['productClassification'] = classification
    try:
        # Strip whitespace, convert yuan to fen (x100), drop the currency symbol.
        item['productPrice'] = response.xpath("//span[@class='goods_price weiruanyahei']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        # Python 2 filter() on a str returns a str of the kept characters.
        item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice']))*100)
    except:
        pass
    # Image link (site-relative src, so the host is prepended).
    try:
        item['productImagePath'] = "http://www.deppre.cn/" + response.xpath('//div[@class="img_center_div"]/div/a/img/@src').extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    #print item['image_urls'],"777777"
    try:
        item['productAddres'] = response.xpath("//div[@class='goods_content_a_l_r f_l']/div[@class='score'][1]/span[@class='brand']/text()").extract()[0]
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    names = self.name+'.json'
    try:
        item['fileName'] = names
    except:
        pass
    # Raw cell lists for the attribute tables.
    list_details = response.xpath("//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
    logging.info("-------list_details_len=%i" %len(list_details))
    list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
    intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
    logging.info("-------intr_len=%i" %len(intro))
    speci = response.xpath("//span[@id='techParam']/text()").extract()
    logging.info("-------intr_len=%i" %len(speci))
    # --- details table: flat [key, value, key, value, ...] cell list ------
    num_one=1
    for value_details in list_details :
        value_details = value_details.encode('utf-8').replace(":","\/").replace("\n","").replace("\"","").strip()
        # NOTE(review): num_one % 2 can never equal 2 — this branch is dead.
        if num_one%2==2 :
            num_one = 1
            continue
        if num_one%2==1 :
            # Odd position -> attribute key; brand/model rows are excluded
            # (num_one=0 marks "skip the matching value on the next pass").
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in value_details:
                num_one=0
                continue
            if '型号' in value_details:
                num_one=0
                continue
            data2['attrkey']=value_details
        else:
            if num_one ==0:
                num_one = 1
                continue
            data2['keyname']=value_details
            details_list.append(data2)
        num_one+=1
    # --- intro table: "key:value" strings split on ':' ---------------------
    num_two=1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n","").replace("\"","").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro :
            if num_two%2==1 :
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey']=value_intro
            else:
                data2['keyname']=value_intro
                intro_list.append(data2)
            num_two+=1
    # --- speci table: same alternation as intro ----------------------------
    num_three=1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n","").replace("\"","").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci :
            if num_three%2==1 :
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey']=value_speci
            else:
                data2['keyname']=value_speci
                speci_list.append(data2)
            num_three+=1
    # Side channel: append the URL and intro paragraphs to a per-spider
    # text file.  NOTE(review): `file` shadows the builtin, and
    # `product_pack` below is extracted but never used — confirm intent.
    product_intro = response.xpath("//div[@class='goods_content_c_r_b_b clearfix']/span/text()").extract()
    product_pack = response.xpath("//td[@id='imgDiv']/div[@id='div3']/font/b/text()").extract()
    filename = self.name+".txt"
    file = open("data/"+filename, 'a+')
    file.write("\n"+"productUrl:"+response.url+"\n")
    file.write("productIntro:"+"\n")
    for intro in product_intro:
        intro = intro.encode('utf-8').replace("\b","").replace("<br/>","").replace("<br>","").strip()
        file.write(intro+"\n")
    file.close()
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response): p = open('aa.html', 'a+') p.write(response.body) item = BaseItem() details_list = [] pack_list = [] intro_list = [] speci_list = [] item['productUrl'] = '' item['productName'] = '' item['productBrand'] = '' item['productModel'] = '' item['productClassification'] = '' item['productPrice'] = '' item['productImagePath'] = '' item['productAddres'] = "" item['productCompany'] = '' item['fileName'] = '' item['productDetails'] = "" item['productPack'] = "" item['productIntro'] = "" item['productSpeci'] = "" classification_one = '' classification_two = '' classification_three = '' print "PhantomJS is starting1..." driver = webdriver.PhantomJS() driver.get(response.url) #time.sleep(3) body = driver.page_source #driver.close() HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response) try: item['productUrl'] = response.url except: pass try: item['productName'] = HtmlResponses.xpath( "//h1[@class='ware_title']/a/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass brand = '' try: brand = HtmlResponses.xpath("//div[@class='ware_text']/div/text()" )[3].extract().encode('utf-8').replace( "\r\n", "\'").strip() except: pass item['productBrand'] = brand model = '' try: model = HtmlResponses.xpath("//div[@class='ware_text']/div/text()" ).extract()[6].encode('utf-8').strip() if '型号' in model: item['productModel'] = HtmlResponses.xpath( "//div[@class='ware_text']/div/text()").extract( )[6].encode('utf-8').replace("型号:", "").strip() else: item['productModel'] = HtmlResponses.xpath( "//div[@class='ware_text']/div/text()").extract( )[5].encode('utf-8').replace("型号:", "").strip() except: pass try: classification_one = HtmlResponses.xpath( "//div[@id='head']/div[@id='path']/a[2]/text()").extract( )[0].encode('utf-8').replace("\"", "\'").strip() classification_two = HtmlResponses.xpath( "//div[@id='head']/div[@id='path']/a[3]/text()").extract( )[0].encode('utf-8').replace("\"", "\'").strip() 
classification_three = HtmlResponses.xpath( "//div[@id='head']/div[@id='path']/a[4]/text()").extract( )[0].encode('utf-8').replace("\"", "\'").strip() except: pass classification = classification_one + '|||' + classification_two + '|||' + classification_three if classification not in self.data1: self.data1.append(classification) item['productClassification'] = classification try: item['productPrice'] = response.xpath( "//div[@class='rate']/span[@class='fontColor3'][2]/text()" ).extract()[0].encode('utf-8').replace("\"", "\'").strip() except: pass try: item['productPrice'] = str( float( filter(lambda ch: ch in '0123456789.~', item['productPrice'])) * 100) except: pass try: item['productImagePath'] = response.xpath( '//span[@class="jqzoom"]/img/@src').extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass #print item['image_urls'],"777777" try: item['productAddres'] = "" except: pass try: item['productCompany'] = "" except: pass names = self.name + '.json' try: item['fileName'] = names except: pass list_details = HtmlResponses.xpath( "//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract() logging.info("-------list_details_len=%i" % len(list_details)) list_pack = HtmlResponses.xpath( "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()" ).extract() intro = HtmlResponses.xpath( "//div[@id='para']/p[2]/font/text()").extract() logging.info("-------intr_len=%i" % len(intro)) driver.close num_one = 1 for value_details in list_details: value_details = value_details.encode('utf-8').replace( ":", "\/").replace("\n", "").replace("\"", "").strip() if num_one % 2 == 2: num_one = 1 continue if num_one % 2 == 1: data2 = {} data2['attrkey'] = '' data2['keyname'] = '' if '品牌' in value_details: num_one = 0 continue if '型号' in value_details: num_one = 0 continue data2['attrkey'] = value_details else: if num_one == 0: num_one = 1 continue data2['keyname'] = value_details 
details_list.append(data2) num_one += 1 num_two = 1 for list_intro in intro: list_intro = list_intro.encode('utf-8').replace(":", "\/").replace( "\n", "").replace("\"", "").strip() list_intro = list_intro.split(':') for value_intro in list_intro: if num_two % 2 == 1: data2 = {} data2['attrkey'] = '' data2['keyname'] = '' data2['attrkey'] = value_intro else: data2['keyname'] = value_intro intro_list.append(data2) num_two += 1 item['productSpeci'] = speci_list item['productPack'] = pack_list item['productIntro'] = intro_list item['productDetails'] = intro_list yield item
def parse(self, response):
    """Scrape one product detail page (sssmro.com layout) into a BaseItem.

    Plain-HTTP scraping: extracts name / brand / model / breadcrumb
    classification / price / image plus the "speci" and "pack" key/value
    tables, and appends the URL and intro paragraphs to
    ``data/<spider-name>.txt``.  Bare ``except: pass`` is the deliberate
    best-effort style used throughout this file.

    NOTE(review): several literals in this method look GBK-mojibake
    (`#ͼƬÁ¬½Ó` was probably `#图片连接` / "image link", and the split token
    ``'£º'`` is probably a fullwidth colon ``：``).  The literals are left
    untouched because they affect runtime behavior — confirm the source
    file's original encoding before changing them.
    """
    item = BaseItem()
    # Accumulators for the {attrkey, keyname} pair dicts built below.
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-seed every field so the exported record always carries all keys.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath("//form[@id='ECS_FORMBUY']/ul/li[1]/dd/div/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productBrand'] = response.xpath("//ul[@class='ul1']/li[@class='clearfix'][2]/dd/div[@class='f_r goos_news']/a/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productModel'] = response.xpath("//ul[@class='ul1']/li[@class='clearfix'][3]/dd/div[@class='f_l']/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Breadcrumb levels 1-3, joined with the '|||' separator.
        classification_one = response.xpath("//div[@id='ur_here']/a[2]/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath("//div[@id='ur_here']/a[3]/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath("//div[@id='ur_here']/a[4]/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
        item['productClassification'] = classification_one + '|||' + classification_two + '|||' + classification_three
    except:
        pass
    try:
        item['productPrice'] = response.xpath("//font[@id='ECS_SHOPPRICE']/text()").extract()[1].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Python 2 filter() on a str returns a str; convert yuan to fen (x100).
        item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice'])) * 100)
    except:
        pass
    #ͼƬÁ¬½Ó
    # Image link (site-relative src, so the host is prepended).
    try:
        item['productImagePath'] = "http://www.sssmro.com/" + response.xpath('//div[@id="preview"]/div[@class="jqzoom"]/img/@src').extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    #print item['image_urls'],"777777"
    try:
        item['productAddres'] = ""
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    names = self.name + '.json'
    try:
        item['fileName'] = names
    except:
        pass
    # Counts logged for diagnostics only; the extracted lists themselves are
    # not used below.
    test_specis = response.xpath("//div[@id='main1']/blockquote[2]/div[@class='qyjstxt']/text()").extract()
    logging.info("-------specis_len=%i" % len(test_specis))
    test_details = response.xpath("//blockquote[@class='block']/div[@class='qyjstxt']/text()").extract()
    logging.info("-------details_len=%i" % len(test_details))
    specis = ''
    try:
        specis = response.xpath("//div[@id='main1']/blockquote[2]/div[@class='qyjst']/text()").extract()[0].encode('utf-8').replace("\n", "").replace("\"", "").strip()
    except:
        pass
    # Split into alternating key/value tokens (see mojibake note above).
    list_speci = specis.split('£º')
    list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specisParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
    # --- speci: even position -> key, odd -> value; first token skipped ----
    num_one = 1
    for speci in list_speci:
        if num_one % 2 == 0:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            data2['attrkey'] = speci
        else:
            if num_one == 1:
                num_one += 1
                continue
            data2['keyname'] = speci
            speci_list.append(data2)
        num_one += 1
    # --- pack: odd position -> key, even -> value --------------------------
    num_two = 1
    for value_pack in list_pack:
        value_pack = value_pack.encode('utf-8').replace(":", "\/").replace("\n", "").replace("\"", "").strip()
        if num_two % 2 == 1:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            data2['attrkey'] = value_pack
        else:
            data2['keyname'] = value_pack
            pack_list.append(data2)
        num_two += 1
    # Side channel: append the URL and intro paragraphs to a per-spider
    # text file.  NOTE(review): `file` shadows the builtin.
    product_intro = response.xpath("//div[@class='formwork_bt'][1]/p/text()").extract()
    filename = self.name + ".txt"
    file = open("data/" + filename, 'a+')
    file.write("\n" + "productUrl:" + response.url + "\n")
    file.write("productIntro:" + "\n")
    for intro in product_intro:
        intro = intro.encode('utf-8').replace("\"", "").strip()
        file.write(intro + "\n")
    file.close()
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    # NOTE(review): intro_list is never populated in this method, so both
    # productIntro and productDetails end up empty — possibly
    # productDetails was meant to get details_list; confirm intent.
    item['productIntro'] = intro_list
    item['productDetails'] = intro_list
    yield item
def parse(self, response):
    """Scrape one product detail page into a BaseItem.

    Plain-HTTP scraping: extracts name / brand / model / breadcrumb
    classification / price / image / address plus the "speci" and "pack"
    key/value tables, and appends the URL and intro paragraphs to
    ``data/<spider-name>.txt``.  Bare ``except: pass`` is the deliberate
    best-effort style used throughout this file.
    """
    item = BaseItem()
    # Accumulators for the {attrkey, keyname} pair dicts built below.
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-seed every field so the exported record always carries all keys.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath("//div[@id='name']/h1/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productBrand'] = response.xpath("//li[@id='summary-brand']/div[@class='dd']/a/em[@class='hl_red bold']/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productModel'] = response.xpath('//div[@class="m m1"]/div/ul/dt/li/text()').extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Breadcrumb levels (anchors 3-5 on this site).
        classification_one = response.xpath("//div[@id='part_content']/div[@class='node_path']/a[3]/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath("//div[@id='part_content']/div[@class='node_path']/a[4]/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath("//div[@id='part_content']/div[@class='node_path']/a[5]/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    item['productClassification'] = classification
    try:
        # Strip whitespace, convert yuan to fen (x100), drop the currency symbol.
        item['productPrice'] = response.xpath("//li[@id='summary-price']/div[@class='dd']/strong/text()").extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Python 2 filter() on a str returns a str of the kept characters.
        item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice'])) * 100)
    except:
        pass
    # Image link
    try:
        item['productImagePath'] = response.xpath('//div[@id="spec-n1"]/a/@href').extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    #print item['image_urls'],"777777"
    try:
        item['productAddres'] = response.xpath("//div[@class='goods_content_a_l_r f_l']/div[@class='score'][1]/span[@class='brand']/text()").extract()[0]
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    names = self.name + '.json'
    try:
        item['fileName'] = names
    except:
        pass
    # Counts logged for diagnostics only; these lists are not used below.
    test_specis = response.xpath("//div[@id='main1']/blockquote[2]/div[@class='qyjstxt']/text()").extract()
    logging.info("-------specis_len=%i" % len(test_specis))
    test_details = response.xpath("//blockquote[@class='block']/div[@class='qyjstxt']/text()").extract()
    logging.info("-------details_len=%i" % len(test_details))
    specis = ''
    try:
        specis = response.xpath("//div[@id='main1']/blockquote[2]/div[@class='qyjst']/text()").extract()[0].encode('utf-8').replace("\n", "").replace("\"", "").strip()
    except:
        pass
    # Split the whole blob into alternating key/value tokens.
    list_speci = specis.split(':')
    list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specisParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
    # --- speci: even position -> key, odd -> value; first token skipped ----
    num_one = 1
    for speci in list_speci:
        if num_one % 2 == 0:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            data2['attrkey'] = speci
        else:
            if num_one == 1:
                num_one += 1
                continue
            data2['keyname'] = speci
            speci_list.append(data2)
        num_one += 1
    # --- pack: odd position -> key, even -> value --------------------------
    num_two = 1
    for value_pack in list_pack:
        value_pack = value_pack.encode('utf-8').replace(":", "\/").replace("\n", "").replace("\"", "").strip()
        if num_two % 2 == 1:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            data2['attrkey'] = value_pack
        else:
            data2['keyname'] = value_pack
            pack_list.append(data2)
        num_two += 1
    # Side channel: append the URL and intro paragraphs to a per-spider
    # text file.  NOTE(review): `file` shadows the builtin.
    product_intro = response.xpath("//div[@id='content_product']/div[@class='property']/text()").extract()
    filename = self.name + ".txt"
    file = open("data/" + filename, 'a+')
    file.write("\n" + "productUrl:" + response.url + "\n")
    file.write("productIntro:" + "\n")
    for intro in product_intro:
        intro = intro.encode('utf-8').replace("\"", "").strip()
        file.write(intro + "\n")
    file.close()
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    # NOTE(review): intro_list is never populated in this method, so both
    # productIntro and productDetails end up empty — confirm whether
    # productDetails should receive details_list instead.
    item['productIntro'] = intro_list
    item['productDetails'] = intro_list
    yield item
def parse(self, response): item = BaseItem() speci_list = [] pack_list = [] intro_list = [] details_list = [] item['productUrl'] = '' item['productName'] = '' item['productBrand'] = '' item['productModel'] = '' item['productClassification'] = '' item['productPrice'] = '' item['productImagePath'] = '' item['productAddres'] = "" item['productCompany'] = '' item['fileName'] = '' item['productDetails'] = "" item['productPack'] = "" item['productIntro'] = "" item['productSpeci'] = "" classification_one = '' classification_two = '' classification_three = '' print "PhantomJS is starting1..." #driver = webdriver.PhantomJS() driver = webdriver.PhantomJS() driver.get(response.url) time.sleep(3) body = driver.page_source #driver.close() HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response) try: item['productUrl'] = response.url except: pass try: item['productName'] = HtmlResponses.xpath( "//h1[@id='comTitle']/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass list_brand = '' for j in range(1, 20): try: list_brand = response.xpath( "//div[@id='pdetail']/div/table/tr[%i]/th/h4/text()" % j).extract()[0].encode('utf-8').replace("\"", "\'").strip() if '品牌:' in list_brand: item['productBrand'] = response.xpath( "//div[@id='pdetail']/div/table/tr[%i]/td/text()" % j).extract()[0].encode('utf-8').replace("\"", "\'").strip() break except: pass list_model = '' for j in range(1, 20): try: list_model = response.xpath( "//div[@id='pdetail']/div/table/tr[%i]/th/h4/text()" % j).extract()[0].encode('utf-8').replace("\"", "\'").strip() if '型号:' in list_model: item['productModel'] = response.xpath( "//div[@id='pdetail']/div/table/tr[%i]/td/text()" % j).extract()[0].encode('utf-8').replace("\"", "\'").strip() break except: pass try: classification_one = response.xpath( "//div[@id='head']/div[@id='path']/a[2]/text()").extract( )[0].encode('utf-8').replace("\"", "\'").strip() classification_two = response.xpath( 
"//div[@id='head']/div[@id='path']/a[3]/text()").extract( )[0].encode('utf-8').replace("\"", "\'").strip() classification_three = response.xpath( "//div[@id='head']/div[@id='path']/a[4]/text()").extract( )[0].encode('utf-8').replace("\"", "\'").strip() except: pass classification = classification_one + '|||' + classification_two + '|||' + classification_three item['productClassification'] = classification try: #去空格 转分 去人民币符号 item['productPrice'] = response.xpath( "//div[@id='oriPriceTop']/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass try: item['productPrice'] = str( float( filter(lambda ch: ch in '0123456789.~', item['productPrice'])) * 100) except: pass #图片连接 try: item['productImagePath'] = HtmlResponses.xpath( "//a[@id='imgContainer']/@hrefs").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass #print item['image_urls'],"777777" try: item['productAddres'] = response.xpath( "//div[@id='pdetail']/div[@class='d-vopy']/table/tbody/tr[4]/td/text()" ).extract()[0] except: pass try: item['productCompany'] = "" except: pass names = self.name + '.json' try: item['fileName'] = names except: pass item['productAddres'] = "" item['productCompany'] = "" names = self.name + '.json' item['fileName'] = names list_details = response.xpath( "//div[@class='d-vopy']/table/tr/th/h4/text()").extract() logging.info("-------list_details_len=%i" % len(list_details)) details = response.xpath( "//div[@class='d-vopy']/table/tr/td/text()").extract() logging.info("-------details_len=%i" % len(details)) list_pack = response.xpath( "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()" ).extract() list_intro = response.xpath( "//table[@class='goods-items']/tr[1]/th/text()").extract() logging.info("-------list_intro_len=%i" % len(list_intro)) intro = response.xpath( "//div[@class='goods']/table[@class='goods-items']/tr[2]/td/text()" ).extract() 
logging.info("-------intr_len=%i" % len(intro)) speci = response.xpath("//span[@id='techParam']/text()").extract() logging.info("-------intr_len=%i" % len(speci)) num_one = 0 for list_details_value in list_details: list_details_value = list_details_value.encode('utf-8').replace( "\n", "").replace("\"", "").strip() data2 = {} data2['attrkey'] = '' data2['keyname'] = '' if '品牌' in list_details_value: num_one += 1 continue if '价格' in list_details_value: num_one += 1 continue if '供应商' in list_details_value: num_one = 0 continue if '保修期' in list_details_value: break data2['attrkey'] = list_details_value data2['keyname'] = details[num_one] details_list.append(data2) num_one += 1 num_two = 0 for list_intro_value in list_intro: list_intro_value = list_intro_value.encode('utf-8').replace( "\n", "").replace("\"", "").strip() data2 = {} data2['attrkey'] = '' data2['keyname'] = '' if '品牌' in list_intro_value: num_two += 1 continue if '价格' in list_intro_value: num_two += 1 continue if '供应商' in list_intro_value: num_two = 0 continue if '保修期' in list_intro_value: break data2['attrkey'] = list_intro_value data2['keyname'] = intro[num_two] intro_list.append(data2) num_two += 1 num_three = 1 for list_speci in speci: list_speci = list_speci.encode('utf-8').replace("\n", "").replace( "\"", "").strip() list_speci = list_speci.split(':') for value_speci in list_speci: if num_three % 2 == 1: data2 = {} data2['attrkey'] = '' data2['keyname'] = '' if '商品名称' in value_speci: break if '品牌' in value_speci: break data2['attrkey'] = value_speci else: data2['keyname'] = value_speci speci_list.append(data2) num_three += 1 item['productSpeci'] = speci_list item['productPack'] = pack_list item['productIntro'] = intro_list item['productDetails'] = details_list yield item
def parse(self, response):
    """Scrape one product detail page (pd_param layout) into a BaseItem.

    Plain-HTTP scraping: extracts name, brand (by scanning the pd_param
    rows for the "品牌" label), model, breadcrumb classification, price,
    image and address, plus the details / intro / speci attribute tables.
    Bare ``except: pass`` is the deliberate best-effort style used
    throughout this file.
    """
    item = BaseItem()
    # Accumulators for the {attrkey, keyname} pair dicts built below.
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-seed every field so the exported record always carries all keys.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        # Breadcrumb levels 1-3.
        classification_one = response.xpath("//div[@class='g_position']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_two = response.xpath("//div[@class='g_position']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        classification_three = response.xpath("//div[@class='g_position']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath('//div[@class="pd_l_cont clearfix"]/div[1]/a/h1/text()').extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    # Scan the pd_param rows for the "品牌" (brand) label; the matching
    # <span> holds the brand value.
    brand = ''
    list_brand = []
    for j in range(1,20):
        try:
            list_brand = response.xpath("//div[@class='pd_param clearfix']/ul/li[%i]/text()" %j).extract()[0].encode('utf-8').replace("\"","\'").strip()
        except:
            pass
        if "品牌" in list_brand:
            brand = response.xpath("//div[@class='pd_param clearfix']/ul/li[%i]/span/text()" %j).extract()[0].encode('utf-8').replace("\"","\'").strip()
            break
    item['productBrand'] = brand
    try:
        item['productModel'] = response.xpath("//div[@class='pd_param clearfix']/ul/li[2]/span/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    try:
        item['productClassification'] = classification
    except:
        pass
    try:
        # Strip whitespace, convert yuan to fen (x100), drop the currency symbol.
        item['productPrice'] = response.xpath("//div[@class='pd_l clearfix']/div[@class='pd_l_cont clearfix'][1]/div[@class='pd_info']/ul[@class='pd_info_supplier']/li[4]//span[@class='f_red']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
        # "面议" = price negotiable.
        # NOTE(review): this assigns a float while every other path stores a
        # string; the filter() below then raises on the float, leaving the
        # value as 0.0 — confirm whether '0.0' (str) was intended.
        if item['productPrice'] == '面议':
            item['productPrice'] = 0.0
    except:
        pass
    try:
        # Python 2 filter() on a str returns a str of the kept characters.
        item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice']))*100)
    except:
        pass
    # Image link
    try:
        item['productImagePath'] = response.xpath("//img[@id='show_big']/@src").extract()[0].encode('utf-8').replace("\"","\'").strip()
    except:
        pass
    #print item['image_urls'],"777777"
    try:
        item['productAddres'] = response.xpath("//span[@class='address_over']/a/text()").extract()[0]
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    names = self.name+'.json'
    try:
        item['fileName'] = names
    except:
        pass
    # Keys come from the <li> text nodes, values from the parallel <span>s.
    list_details = response.xpath("//div[@class='pd_param clearfix']/ul/li/text()").extract()
    details = response.xpath("//div[@class='pd_param clearfix']/ul/li/span/text()").extract()
    logging.info("-------list_details_len=%i" %len(list_details))
    list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
    intro = response.xpath("//span[@id='PDescription']/text()").extract()
    logging.info("-------intr_len=%i" %len(intro))
    speci = response.xpath("//span[@id='techParam']/text()").extract()
    logging.info("-------intr_len=%i" %len(speci))
    # --- details: pair key list with value list by running index -----------
    # NOTE(review): details[num_one] can raise IndexError when the two
    # lists diverge (no try around this loop) — confirm both lists always
    # have matching lengths on this site.
    num_one=0
    for value_details in list_details :
        value_details = value_details.encode('utf-8').replace(":","\/").replace("\n","").replace("\"","").strip()
        # Brand and model already went into their own fields above.
        if '品牌' in value_details:
            num_one+=1
            continue
        if '型号' in value_details:
            num_one+=1
            continue
        else:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            data2['attrkey']=value_details
            data2['keyname']=details[num_one]
            details_list.append(data2)
        num_one+=1
    # --- intro: "key:value" strings split on ':' ---------------------------
    num_two=1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n","").replace("\"","").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro :
            if num_two%2==1 :
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey']=value_intro
            else:
                data2['keyname']=value_intro
                intro_list.append(data2)
            num_two+=1
    # --- speci: same alternation as intro ----------------------------------
    num_three=1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n","").replace("\"","").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci :
            if num_three%2==1 :
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey']=value_speci
            else:
                data2['keyname']=value_speci
                speci_list.append(data2)
            num_three+=1
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Parse a zhaogongye.cn product page into a BaseItem.

    Best-effort scraping: every field is pre-seeded and each extraction is
    wrapped in try/except so a missing node leaves the field empty instead
    of aborting the page. Builds speci/pack attribute lists, appends a
    plain-text dump of the detail/speci sections to data/<spider>.txt and
    yields the item.

    Fixes vs. original:
      * item['productDetails'] was assigned intro_list (aliasing it with
        productIntro); it now gets details_list like the sibling spiders.
      * a no-op `filter(str.isalnum, ...)` whose result was discarded has
        been removed.
      * the dump file is opened with `with` so it is closed on error.
    """
    item = BaseItem()
    speci_list = []
    pack_list = []
    intro_list = []    # never populated by this spider; kept for schema parity
    details_list = []  # likewise: detail text only goes into the .txt dump
    # Pre-seed every field so consumers always find the key.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath(
            "//h1[@class='lh40 col59 f18']/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    list_brand = ''
    try:
        # The first keyValue row normally names the brand; the coated-glove
        # category ('涂层手套') lays the table out differently.
        list_brand = response.xpath(
            "//tr[@class='keyValue'][1]/td[1]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        if '涂层手套' not in list_brand:
            brand = response.xpath(
                "//div[@class='detailAndBuy']/div[@class='detail'][1]/span[@class='typeValue']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            item['productBrand'] = brand.split(' ')[0]
        else:
            item['productBrand'] = response.xpath(
                "//tr[@class='keyValue'][1]/td[2]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
            # NOTE(review): the original called
            # filter(str.isalnum, item['productBrand']) here and discarded
            # the result — a no-op, removed. If stripping non-alphanumerics
            # was intended, the result must be assigned back.
    except:
        pass
    try:
        item['productModel'] = response.xpath(
            "//div[@class='cpzstm']/b/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        classification_one = response.xpath(
            "//div[@class='crumbs']/span[2]/a/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath(
            "//div[@class='crumbs']/span[3]/a/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='crumbs']/span[4]/a/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    # '|||' is the project-wide separator between classification levels.
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    item['productClassification'] = classification
    try:
        item['productPrice'] = response.xpath(
            "//span[@id='show-price']/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Keep only digits/dot/tilde, then convert yuan to cents (*100).
        item['productPrice'] = str(
            float(
                filter(lambda ch: ch in '0123456789.~',
                       item['productPrice'])) * 100)
    except:
        pass
    try:
        item['productImagePath'] = 'http://www.zhaogongye.cn' + response.xpath(
            "//span[@class='jqzoom']/img/@src").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productAddres'] = ""
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    names = self.name + '.json'
    try:
        item['fileName'] = names
    except:
        pass
    test_specis = response.xpath(
        "//div[@id='main1']/blockquote[2]/div[@class='qyjstxt']/text()"
    ).extract()
    logging.info("-------specis_len=%i" % len(test_specis))
    test_details = response.xpath(
        "//blockquote[@class='block']/div[@class='qyjstxt']/text()"
    ).extract()
    logging.info("-------details_len=%i" % len(test_details))
    specis = ''
    try:
        specis = response.xpath(
            "//div[@id='main1']/blockquote[2]/div[@class='qyjst']/text()"
        ).extract()[0].encode('utf-8').replace("\n", "").replace("\"",
                                                                 "").strip()
    except:
        pass
    # The spec blob is 'label:value:label:value:...'; alternate segments
    # become attrkey/keyname pairs below (the very first segment is skipped).
    list_speci = specis.split(':')
    list_pack = response.xpath(
        "//div[@class='packageParameter tabContent']/div[@class='specisParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
    ).extract()
    num_one = 1
    for speci in list_speci:
        if num_one % 2 == 0:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            data2['attrkey'] = speci
        else:
            if num_one == 1:
                # Skip the leading segment before the first ':'.
                num_one += 1
                continue
            data2['keyname'] = speci
            speci_list.append(data2)
        num_one += 1
    num_two = 1
    for value_pack in list_pack:
        value_pack = value_pack.encode('utf-8').replace(":", "\/").replace(
            "\n", "").replace("\"", "").strip()
        if num_two % 2 == 1:
            # Odd cells are keys, even cells the matching values.
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            data2['attrkey'] = value_pack
        else:
            data2['keyname'] = value_pack
            pack_list.append(data2)
        num_two += 1
    product_details = response.xpath(
        "//blockquote[@class='block']/div[@class='qyjstxt']/text()"
    ).extract()
    product_speci = response.xpath(
        "//div[@id='main1']/blockquote[2]/div[@class='qyjstxt']/text()"
    ).extract()
    # Append the raw detail/speci text to data/<spider>.txt; `with` closes
    # the handle even if a write raises (original leaked it).
    filename = self.name + ".txt"
    with open("data/" + filename, 'a+') as fh:
        fh.write("\n" + "productUrl:" + response.url + "\n")
        fh.write("productDetails:" + "\n")
        for details in product_details:
            details = details.encode('utf-8').replace("\b", "").replace(
                "<br/>", "").replace("<br>", "").strip()
            fh.write(details + "\n")
        fh.write("productSpeci:" + "\n")
        for speci in product_speci:
            speci = speci.encode('utf-8').replace("\"", "").strip()
            fh.write(speci + "\n")
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    # Bug fix: was `= intro_list`, aliasing productIntro/productDetails to
    # the same list object; sibling spiders all assign details_list.
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Parse a rolymro.com product page into a BaseItem.

    Best-effort scraping: every field is pre-seeded and extractions are
    wrapped in try/except so a missing node leaves the field empty. The
    spec table is paired via an index counter (num_one) coupling the
    label column to the value column; intro/speci blobs are split on ':'
    and paired by alternation. Also appends a raw intro/pack text dump to
    data/<spider>.txt, then yields the item.
    """
    item = BaseItem()
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-seed every field so consumers always find the key.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        # Breadcrumb levels; note level one comes from a different container
        # ('breadth') than levels two/three ('position w1000') — presumably
        # matching this site's markup; TODO confirm against a live page.
        classification_one = response.xpath(
            "//div[@class='breadth']/div/a[3]/text()").extract()[0].encode(
                'utf-8').replace(".", "").strip()
        classification_two = response.xpath(
            "//div[@class='position w1000']/a[3]/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='position w1000']/a[4]/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # '|||' is the project-wide separator between classification levels.
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    #print "PhantomJS is starting1..."
    #driver = webdriver.PhantomJS()
    #driver.get(response.url)
    #time.sleep(3)
    #body = driver.page_source
    #driver.close()
    #HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response)
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath(
            "//div[@class='corpus_left']/div/h3/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productBrand'] = response.xpath(
            "//div[@class='intro']/table/tr[4]/td[2]/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productModel'] = response.xpath(
            "//div[@class='intro']/table/tr[1]/td[2]/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productClassification'] = classification
    except:
        pass
    try:
        # strip whitespace, convert yuan to cents, drop the RMB sign
        item['productPrice'] = response.xpath(
            "//div[@class='intro']/table/tr[3]/td[2]/b/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Keep only digits/dot/tilde, then convert yuan to cents (*100).
        item['productPrice'] = str(
            float(
                filter(lambda ch: ch in '0123456789.~',
                       item['productPrice'])) * 100)
    except:
        pass
    # image URL
    try:
        item[
            'productImagePath'] = "http://www.rolymro.com/" + response.xpath(
                "//div[@class='sphoto']/a[@class='sphoto']/img/@src"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    #print item['image_urls'],"777777"
    try:
        item['productAddres'] = response.xpath(
            "//div[@id='pdetail']/div[@class='d-vopy']/table/tbody/tr[4]/td/text()"
        ).extract()[0]
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    names = self.name + '.json'
    try:
        item['fileName'] = names
    except:
        pass
    # Column 1 = attribute labels, column 2 = values; paired by index below,
    # so the two extracts are assumed to stay aligned — TODO confirm.
    list_details = response.xpath(
        "//table[@class='table_pro_info']/tr/td[1]/text()").extract()
    details = response.xpath(
        "//table[@class='table_pro_info']/tr/td[2]/text()").extract()
    logging.info("-------list_details_len=%i" % len(list_details))
    logging.info("-------details_len=%i" % len(details))
    list_pack = response.xpath(
        "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
    ).extract()
    intro = response.xpath("//span[@id='PDescription']/text()").extract()
    logging.info("-------intr_len=%i" % len(intro))
    speci = response.xpath("//span[@id='techParam']/text()").extract()
    logging.info("-------intr_len=%i" % len(speci))
    # num_one indexes into `details` (the value column) while iterating the
    # label column; skipped labels advance the index without emitting.
    num_one = 0
    for value_details in list_details:
        value_details = value_details.encode('utf-8').replace(
            ":", "\/").replace("\n", "").replace("\t",
                                                 "").replace("\b",
                                                             "").strip()
        data2 = {}
        data2['attrkey'] = ''
        data2['keyname'] = ''
        if '产品单价' in value_details:
            # Unit-price row: skips two value cells — presumably this row
            # spans two columns on the site; TODO confirm.
            num_one += 2
            continue
        if '品牌' in value_details:
            # Brand already captured above; skip its value cell.
            num_one += 1
            continue
        if '型号' in value_details:
            # Model already captured above; skip its value cell.
            num_one += 1
            continue
        data2['attrkey'] = value_details
        data2['keyname'] = details[num_one].encode('utf-8').replace(
            ":", "\/").replace("\n", "").replace("\t",
                                                 "").replace("\b",
                                                             "").strip()
        details_list.append(data2)
        num_one += 1
    # Intro blob: split on ':' and pair alternating segments as key/value;
    # bail out of a fragment when it names the product or brand.
    num_two = 1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace(
            "\r\n\t", "").replace("\"", "").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro:
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey'] = value_intro
            else:
                data2['keyname'] = value_intro
                intro_list.append(data2)
            num_two += 1
    # Tech-spec blob: same alternation scheme as the intro blob.
    num_three = 1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci:
            if num_three % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey'] = value_speci
            else:
                data2['keyname'] = value_speci
                speci_list.append(data2)
            num_three += 1
    product_intro = response.xpath(
        "//span[@id='PDescription']/text()").extract()
    product_pack = response.xpath(
        "//td[@id='imgDiv']/div[@id='div3']/font/b/text()").extract()
    # Append raw intro/pack text to data/<spider>.txt.
    filename = self.name + ".txt"
    file = open("data/" + filename, 'a+')
    file.write("\n" + "productUrl:" + response.url + "\n")
    file.write("productIntro:" + "\n")
    for intro in product_intro:
        intro = intro.encode('utf-8').replace("\b", "").replace(
            "<br/>", "").replace("<br>", "").strip()
        file.write(intro + "\n")
    file.write("productPack:" + "\n")
    for pack in product_pack:
        pack = pack.encode('utf-8').replace("\"", "").strip()
        file.write(pack + "\n")
    file.close()
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Parse a www.ispek.cn product page into a BaseItem.

    Walks breadcrumb anchors for the classification, reads price/company
    from the order form (or from plain text when there is no form), and
    scans the attribute rows for model / origin / packaging. Yields one
    item per page.
    """
    # Build the selector for this response.
    sel = Selector(response)
    # Accumulators and field defaults.
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    productClassification = ""
    productName = ""
    productBrand = ""
    productModel = ""
    productPrice = ""
    productImagePath = ""
    productAddres = ""
    productId = ""
    productCompany = ""
    productPack = ""
    price = ""
    fileName = "www_ispek_cn_data_info.json"
    # Instantiate the item object.
    item = BaseItem()
    # Extract from the page. NOTE(review): this first extract is unguarded;
    # a page without #picshower raises IndexError out of parse().
    productImagePath = sel.xpath(".//*[@id='picshower']/img/@src").extract()[0]
    # Only product-detail pages (no listing table) are parsed further.
    if len(sel.xpath(".//div[@id='sample-table-2_wrapper']")) == 0:
        try:
            # Breadcrumb anchors after the first form the classification,
            # joined by the project-wide '|||' separator.
            tempSel = sel.xpath(".//*[@id='main-container']/div/div[1]/div[1]/a")
            i = 1
            classificationStr = ""
            while i < len(tempSel):
                classificationStr = classificationStr + tempSel[i].xpath("text()").extract()[0].strip() + "|||"
                i += 1
            productClassification = classificationStr.rstrip("|||")
            tempSel = sel.xpath(".//*[@id='main-container']/div/div[1]/div[2]/div[2]")
            if len(tempSel.xpath("form")):
                # Page with an order form: "price/unit" lives in a span.
                tempSel = tempSel.xpath("form")
                temps = tempSel.xpath("div[1]/div[1]/span/text()")[0].extract()
                productPrice = temps.split("/")[0].strip()
                productPrice = filter(lambda ch: ch in '0123456789.', productPrice)
                productCompany = temps.split("/")[1]
            else:
                # NOTE(review): this assigns a *list* (split result) to
                # productPrice; the later .encode() on it raises
                # AttributeError, which the broad except below swallows, so
                # `price` stays "" on this path. Looks like a bug — the
                # .split("~") was probably not intended here; confirm.
                temps = tempSel.xpath("div[1]/div[1]/text()")[0].extract().strip()
                productPrice = filter(lambda ch: ch in '0123456789.~', temps).split("~")
                productCompany = ""
            productName = tempSel.xpath("h2/text()").extract()[0]
            productBrand = tempSel.xpath("div[2]/div/div[1]/a/text()").extract()[0]
            # Remaining attribute rows: "label:value" pairs; pick out
            # model (型号), origin (产地) and packaging (包装).
            tempSel = tempSel.xpath("div[2]/div/div")
            i = 2
            while i < len(tempSel):
                tempStr = ""
                tempStr = tempSel[i].xpath("text()").extract()[0].strip().split(u":")
                if tempStr[0] == u"型号":
                    productModel = tempStr[1]
                if tempStr[0] == u"产地":
                    productAddres = tempStr[1]
                if tempStr[0] == u"包装":
                    productPack = tempStr[1]
                i += 1
            # Convert yuan to cents; a 'lo~hi' range converts both ends.
            prices = productPrice.encode('utf-8').replace("\"", "\'").strip().split("~")
            if len(prices) > 1:
                price = str(float(prices[0]) * 100) + "~" + str(float(prices[1]) * 100)
            else:
                price = str(float(prices[0]) * 100)
        except Exception, e:
            print "-----------------yichang--------------->", e
    # Populate the item from the collected values.
    # NOTE(review): the collapsed original does not show whether this block
    # sat inside the `if` above; placed at method level here — confirm.
    item['productUrl'] = response.url
    item['productImagePath'] = "http://www.ispek.cn" + productImagePath.encode('utf-8').replace("\"", "\'").strip()
    item['productClassification'] = productClassification.encode('utf-8').replace("\"", "\'").strip()
    item['productName'] = productName.encode('utf-8').replace("\"", "\'").strip()
    item['productBrand'] = productBrand.encode('utf-8').replace("\"", "\'").strip()
    item['productModel'] = productModel.encode('utf-8').replace("\"", "\'").strip()
    item['productAddres'] = productAddres.encode('utf-8').replace("\"", "\'").strip()
    item['productCompany'] = productCompany.encode('utf-8').replace("\"", "\'").strip()
    item['productPrice'] = price
    item['fileName'] = fileName
    item['productDetails'] = ""
    item['productPack'] = productPack.encode('utf-8').replace("\"", "\'").strip()
    item['productIntro'] = ""
    item['productSpeci'] = ""
    yield item
>>>>>>> 7235bd3b1c4452496ce81c35a74b003805fe6394 ======= >>>>>>> 7235bd3b1c4452496ce81c35a74b003805fe6394 productName="" productBrand="" productModel="" productPrice="" productImagePath="" productId="" productCompany="" temps="" productDetails="" productPack="" fileName="zc_mrobay_com_data_info.json" #Instantiation CrawlertoolsItem object item=BaseItem() #Parse text productImagePath=sel.xpath(".//*[@id='showPic']/@src").extract()[0] try: tempSel=sel.xpath(".//div[@class='Xh_xq']") productName=tempSel.xpath("h1/text()").extract()[0] productBrand=tempSel.xpath("div[2]/div[2]/ul/li[3]/text()").extract()[0] temps=tempSel.xpath("div[2]/div[2]/ul/li[1]/span/b/text()")[0].extract() productPrice=str(float(filter(lambda ch: ch in '0123456789.~', temps))*100) productCompany=u"套" productModel=tempSel.xpath("div[2]/div[2]/ul/li[4]/p[1]/text()")[0].extract() except Exception,e: print "-----------------yichang--------------->",e try: productPack=sel.xpath(".//div[5]/div/div[2]/div[2]/ul/li[9]/text()").extract()[0]
def parse(self, response): item = BaseItem() speci_list = [] pack_list = [] intro_list = [] details_list = [] item['productUrl'] = '' item['productName'] = '' item['productBrand'] = '' item['productModel'] = '' item['productClassification'] = '' item['productPrice'] = '' item['productImagePath'] = '' item['productAddres'] = "" item['productCompany'] = '' item['fileName'] = '' item['productDetails'] = "" item['productPack'] = "" item['productIntro'] = "" item['productSpeci'] = "" classification_one = '' classification_two = '' classification_three = '' #print "PhantomJS is starting1..." #driver = webdriver.PhantomJS() #driver.get(response.url) #time.sleep(3) #body = driver.page_source #HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response) try: item['productUrl'] = response.url except: pass try: item['productName'] = response.xpath( "//h1[@id='title']/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass try: classification_one = response.xpath( "//div[@class='pos']/a[3]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() classification_two = response.xpath( "//div[@class='pos']/a[4]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() classification_three = response.xpath( "//div[@class='pos']/a[5]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass classification = classification_one + '|||' + classification_two + '|||' + classification_three item['productClassification'] = classification try: item['productPrice'] = response.xpath( "//span[@class='f_price px16']/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass try: item['productPrice'] = str( float( filter(lambda ch: ch in '0123456789.~', item['productPrice'])) * 100) except: pass try: imagePath = response.xpath("//div[@id='mid_div']/img/@src" ).extract()[0].encode('utf-8').replace( "\"", "\'").strip() item['productImagePath'] = imagePath except: pass try: 
item['productAddres'] = "" except: pass try: item['productCompany'] = "" except: pass names = self.name + '.json' try: item['fileName'] = names except: pass print "PhantomJS is starting1..." driver = webdriver.PhantomJS() driver.get(response.url) time.sleep(3) body = driver.page_source HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response) list_brand = HtmlResponses.xpath( "//table/tbody/tr/td[2]/table/tbody/tr/td/table/tbody/tr/td[1]" ).extract() list_speci_th = response.xpath( "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/p/text()" ).extract() list_detail = response.xpath("//div[@id='content']").extract() logging.info("----------list_detail_len=%i" % len(list_detail)) list_intro = response.xpath( "//div[@id='tbc_13']/div[@class='intro_box']").extract() logging.info("----------list_intro_len=%i" % len(list_intro)) for j in range(1, len(list_brand)): brand = HtmlResponses.xpath( "//table/tbody/tr/td[2]/table/tbody/tr/td/table/tbody/tr[%i]/td[1]/text()" % j).extract()[0].encode('utf-8').replace(":", "\/").replace( "\n", "").replace("\"", "").strip() if "型号" in brand: item['productModel'] = HtmlResponses.xpath( "//table/tbody/tr[2]/td[3]/table/tbody/tr[%i]/td[2]/a[@class='b']/text()" % j).extract()[0].encode('utf-8').replace( ":", "\/").replace("\n", "").replace("\"", "").strip() continue if "品牌" in brand: item['productBrand'] = HtmlResponses.xpath( "//table/tbody/tr[2]/td[3]/table/tbody/tr[%i]/td[2]/a[@class='b']/text()" % j).extract()[0].encode('utf-8').replace( ":", "\/").replace("\n", "").replace("\"", "").strip() break driver.close() #11.28 brand = '' try: brand = response.xpath( "//dl[@class='pro-info-prop pro-info-brand']/dd[@class='pro-info-cons']/text()" ).extract()[0].encode('utf-8').replace("\t", "").replace( "\n", "").replace("\b", "").replace("\r", "").strip() item['productBrand'] = brand except: pass model = '' try: model = response.xpath( "//dl[@class='pro-info-prop 
pro-info-model']/dd[@class='pro-info-cons']/text()" ).extract()[0].encode('utf-8').replace("\t", "").replace( "\n", "").replace("\b", "").strip() item['productModel'] = model except: pass #11.28 for value_detail in list_detail: value_detail = value_detail.encode('utf-8').replace( "\t", "").replace("\n", "").replace("\b", "").replace( "<br>", "").replace("</br>", "").replace("\r", "").strip() dr = re.compile(r'<[^>]+>', re.S) dd_value_detail = dr.sub('', value_detail) details_list.append(dd_value_detail) cancel = '' try: cancel_l = response.xpath( "//p[@class='link-detail']/text()").extract() for cancel_s in cancel_l: cancel_s = cancel_s.encode('utf-8').replace("\t", "").replace( "\n", "").replace("\b", "").replace("<br>", "").replace( "</br>", "").replace("\r", "").strip() cancel += cancel_s except: pass for value_intro in list_intro: value_intro = value_intro.encode('utf-8').replace( "\t", "").replace("\n", "").replace("\b", "").replace( "<br>", "").replace("</br>", "").replace("\r", "").strip() value_intro.replace(cancel, '') dr = re.compile(r'<[^>]+>', re.S) dd_value_intro = dr.sub('', value_intro) intro_list.append(dd_value_intro) item['productSpeci'] = speci_list item['productPack'] = pack_list item['productIntro'] = intro_list item['productDetails'] = details_list yield item
def parse(self, response):
    """Parse an isweek.cn product page into a BaseItem.

    Best-effort scraping: fields are pre-seeded and extractions wrapped in
    try/except. The spec table is paired into attrkey/keyname dicts by an
    alternation counter; intro/speci blobs are split on ':' and paired the
    same way. Also appends a raw intro/speci text dump to
    data/<spider>.txt, then yields the item.

    Fixes vs. original:
      * removed the dead branch `if num_one % 2 == 2` (x % 2 is never 2).
      * `price` is initialised to '' so a failed price xpath no longer
        leaves it unbound (the resulting NameError was silently swallowed
        by the next bare except).
      * the dump file is opened with `with` so it is closed on error, and
        the handle no longer shadows the `file` builtin.
      * removed `product_pack`, an extract whose result was never used.
    """
    item = BaseItem()
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-seed every field so consumers always find the key.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        classification_one = response.xpath(
            "//div[@class='w-fly-cnt']/div[@class='position']/a[2]/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath(
            "//div[@class='w-fly-cnt']/div[@class='position']/a[3]/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='w-fly-cnt']/div[@class='position']/a[4]/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # '|||' is the project-wide separator between classification levels.
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    item['productUrl'] = response.url
    try:
        item['productName'] = response.xpath(
            "//div[@class='prodetails']/h1[@class='protitle']/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productBrand'] = response.xpath(
            '//ul[@class="list_pic"]/li/dl/dt/a/text()').extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productModel'] = response.xpath(
            "//div[@class='fn-fr']/div[@class='add-to-basket']/dl[@class='fn-clearfix atb-dl-01']/dd/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productClassification'] = classification
    except:
        pass
    price = ''  # was unbound when the xpath below failed
    try:
        price = response.xpath(
            "//font[@id='ECS_GOODS_AMOUNT']/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Keep only digits/dot/tilde, then convert yuan to cents (*100).
        item['productPrice'] = str(
            float(filter(lambda ch: ch in '0123456789.~', price)) * 100)
    except:
        pass
    try:
        item['productImagePath'] = "http://www.isweek.cn/" + response.xpath(
            '//a[@class="jqzoom"]/img/@src').extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    item['productAddres'] = ""
    item['productCompany'] = ""
    names = self.name + '.json'
    item['fileName'] = names
    list_details = response.xpath(
        "//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
    logging.info("-------list_details_len=%i" % len(list_details))
    list_pack = response.xpath(
        "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
    ).extract()
    # NOTE(review): 'PDescriptiion' (double i) differs from the
    # 'PDescription' id used by sibling spiders — possibly the site's real
    # id, possibly a typo; left untouched, confirm against a live page.
    intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
    logging.info("-------intr_len=%i" % len(intro))
    speci = response.xpath("//span[@id='techParam']/text()").extract()
    logging.info("-------intr_len=%i" % len(speci))
    # Alternation counter: odd positions are labels, even positions the
    # matching values; brand/model rows reset it to 0 so the following
    # value cell is skipped too.
    num_one = 1
    for value_details in list_details:
        value_details = value_details.encode('utf-8').replace(
            ":", "\/").replace("\n", "").replace("\"", "").strip()
        if num_one % 2 == 1:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in value_details:
                num_one = 0
                continue
            if '型号' in value_details:
                num_one = 0
                continue
            data2['attrkey'] = value_details
        else:
            if num_one == 0:
                # Skip the value cell of a skipped brand/model label.
                num_one = 1
                continue
            data2['keyname'] = value_details
            details_list.append(data2)
        num_one += 1
    # Intro blob: split on ':' and pair alternating segments as key/value;
    # bail out of a fragment when it names the product or brand.
    num_two = 1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro:
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey'] = value_intro
            else:
                data2['keyname'] = value_intro
                intro_list.append(data2)
            num_two += 1
    # Tech-spec blob: same alternation scheme.
    num_three = 1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci:
            if num_three % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey'] = value_speci
            else:
                data2['keyname'] = value_speci
                speci_list.append(data2)
            num_three += 1
    product_intro = response.xpath(
        "//div[@class='prodesc']/p/span/text()").extract()
    product_speci = response.xpath(
        "//div[@class='fn-fr']/div[@class='info']/p/text()").extract()
    # Append raw intro/speci text to data/<spider>.txt; `with` closes the
    # handle even if a write raises (original leaked it).
    filename = self.name + ".txt"
    with open("data/" + filename, 'a+') as fh:
        fh.write("\n" + "productUrl:" + response.url + "\n")
        fh.write("productIntro:" + "\n")
        for intro in product_intro:
            intro = intro.encode('utf-8').replace("\b", "").replace(
                "<br/>", "").replace("<br>", "").strip()
            fh.write(intro + "\n")
        fh.write("productSpeci:" + "\n")
        for speci in product_speci:
            speci = speci.encode('utf-8').replace("\"", "").strip()
            fh.write(speci + "\n")
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Parse an ieou.com.cn product page into a BaseItem.

    Best-effort scraping: fields are pre-seeded and extractions wrapped in
    try/except. The extended-attribute list is paired label->value via an
    index counter into a parallel extract; intro/speci blobs are split on
    ':' and paired by alternation. Also appends a raw intro/details/speci
    text dump to data/<spider>.txt, then yields the item.
    """
    item = BaseItem()
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-seed every field so consumers always find the key.
    item['productUrl'] = ''
    item['productName'] = ''
    item['productBrand'] = ''
    item['productModel'] = ''
    item['productClassification'] = ''
    item['productPrice'] = ''
    item['productImagePath'] = ''
    item['productAddres'] = ""
    item['productCompany'] = ''
    item['fileName'] = ''
    item['productDetails'] = ""
    item['productPack'] = ""
    item['productIntro'] = ""
    item['productSpeci'] = ""
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        classification_one = response.xpath(
            "//div[@id='Public_breadCrumb01-110']/a[@class='arrowbg1']/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath(
            "//div[@id='Public_breadCrumb01-110']/a[@class='arrowbg2']/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@id='Public_breadCrumb01-110']/a[@class='arrowbg3']/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    # '|||' is the project-wide separator between classification levels.
    classification = classification_one + '|||' + classification_two + '|||' + classification_three
    try:
        item['productUrl'] = response.url
    except:
        pass
    try:
        item['productName'] = response.xpath(
            "//div[@id='ProductDetail_basic01-101']/h1[@class='htmlinline']/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productBrand'] = response.xpath(
            '//ul[@class="list_pic"]/li/dl/dt/a/text()').extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productModel'] = response.xpath(
            "//li[@class='number']/em/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        item['productClassification'] = classification
    except:
        pass
    try:
        # strip whitespace, convert yuan to cents, drop the RMB sign
        # NOTE(review): this xpath selects the <strong> node (not text()),
        # so the extracted value is its HTML string — confirm intended.
        item['productPrice'] = response.xpath(
            "//li[@class='retailprice']/em/strong").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except:
        pass
    try:
        # Keep only digits/dot/tilde, then convert yuan to cents (*100).
        item['productPrice'] = str(
            float(
                filter(lambda ch: ch in '0123456789.~',
                       item['productPrice'])) * 100)
    except:
        pass
    # image URL
    try:
        item[
            'productImagePath'] = "http://www.ieou.com.cn" + response.xpath(
                '//div[@class="jqzoom"]/img/@src').extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
    except:
        pass
    #print item['image_urls'],"777777"
    try:
        item['productAddres'] = response.xpath(
            "//div[@id='pdetail']/div[@class='d-vopy']/table/tbody/tr[4]/td/text()"
        ).extract()[0]
    except:
        pass
    try:
        item['productCompany'] = ""
    except:
        pass
    names = self.name + '.json'
    try:
        item['fileName'] = names
    except:
        pass
    # Parallel label/value extracts, paired by index (num_one) below — the
    # two node sets are assumed to stay aligned; TODO confirm.
    list_details = response.xpath(
        "//div[@class='extend']/ul/li/label/text()").extract()
    logging.info("-------list_details_len=%i" % len(list_details))
    details = response.xpath(
        "//div[@class='extend']/ul/li/span/text()").extract()
    logging.info("-------details_len=%i" % len(details))
    list_pack = response.xpath(
        "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
    ).extract()
    # NOTE(review): 'descreibe'/'desceribe htmleit' below differ from the
    # 'describe htmledit' class used later — possibly the site's real
    # (misspelled) class names, possibly typos; left untouched, confirm.
    intro = response.xpath(
        "//div[@class='descreibe htmledit']/ul/li/span/span/text()"
    ).extract()
    logging.info("-------intr_len=%i" % len(intro))
    speci = response.xpath(
        "//div[@class='desceribe htmleit']/ul/li/span/span/span/text()"
    ).extract()
    logging.info("-------intr_len=%i" % len(speci))
    # num_one indexes into `details` (values) while iterating the labels;
    # brand/model rows advance the index without emitting a pair.
    num_one = 0
    for value_details in list_details:
        value_details = value_details.encode('utf-8').replace(
            ":", "").replace("\n", "").replace("\"", "").strip()
        data2 = {}
        data2['attrkey'] = ''
        data2['keyname'] = ''
        if '品牌' in value_details:
            num_one += 1
            continue
        if '型号' in value_details:
            num_one += 1
            continue
        data2['attrkey'] = value_details
        data2['keyname'] = details[num_one].encode('utf-8').replace(
            ":", "").replace("\n", "").replace("\"", "").strip()
        details_list.append(data2)
        num_one += 1
    # Intro blob: split on ':' and pair alternating segments as key/value;
    # bail out of a fragment when it names the product or brand.
    num_two = 1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro:
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey'] = value_intro
            else:
                data2['keyname'] = value_intro
                intro_list.append(data2)
            num_two += 1
    # Tech-spec blob: same alternation scheme.
    num_three = 1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci:
            if num_three % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey'] = value_speci
            else:
                data2['keyname'] = value_speci
                speci_list.append(data2)
            num_three += 1
    product_intro = response.xpath(
        "//div[@class='describe htmledit']/ul/li/span/span/text()"
    ).extract()
    product_details = response.xpath(
        "//div[@class='describe htmledit']/p/span/text()").extract()
    # Spec text nesting varies between pages; fall back to the alternate
    # strong/span order when the first extract is empty.
    product_speci = response.xpath(
        "//div[@class='describe htmledit']/p/span/strong/text()").extract(
        )
    if len(product_speci) == 0:
        product_speci = response.xpath(
            "//div[@class='describe htmledit']/p/strong/span/text()"
        ).extract()
    product_pack = response.xpath(
        "//td[@id='imgDiv']/div[@id='div3']/font/b/text()").extract()
    # Append raw intro/details/speci text to data/<spider>.txt.
    filename = self.name + ".txt"
    file = open("data/" + filename, 'a+')
    file.write("\n" + "productUrl:" + response.url + "\n")
    file.write("productIntro:" + "\n")
    for intro in product_intro:
        intro = intro.encode('utf-8').replace("\b", "").replace(
            "<br/>", "").replace("<br>", "").strip()
        file.write(intro + "\n")
    file.write("productDetails:" + "\n")
    for details in product_details:
        details = details.encode('utf-8').replace("\"", "").strip()
        file.write(details + "\n")
    file.write("productSpeci:" + "\n")
    for speci in product_speci:
        speci = speci.encode('utf-8').replace("\"", "").strip()
        file.write(speci + "\n")
    file.close()
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response): item = BaseItem() speci_list = [] pack_list = [] intro_list = [] details_list = [] item['productUrl'] = '' item['productName'] = '' item['productBrand'] = '' item['productModel'] = '' item['productClassification'] = '' item['productPrice'] = '' item['productImagePath'] = '' item['productAddres'] = "" item['productCompany'] = '' item['fileName'] = '' item['productDetails'] = "" item['productPack'] = "" item['productIntro'] = "" item['productSpeci'] = "" classification_one = '' classification_two = '' classification_three = '' print "PhantomJS is starting1..." driver = webdriver.PhantomJS() driver.get(response.url) time.sleep(3) body = driver.page_source HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response) try: item['productUrl'] = response.url except: pass try: item['productName'] = HtmlResponses.xpath( "//div[@class='panel-heading panel-heading-div']/text()" ).extract()[0].encode('utf-8').replace("\"", "\'").strip() except: pass try: classification_one = response.xpath( "//div[@class='crumbs']/span[2]/a/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() classification_two = response.xpath( "//div[@class='crumbs']/span[3]/a/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() classification_three = response.xpath( "//div[@class='crumbs']/span[4]/a/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass classification = classification_one + '|||' + classification_two + '|||' + classification_three item['productClassification'] = classification try: item['productPrice'] = HtmlResponses.xpath( "//font[@class='price-font'][2]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass try: item['productPrice'] = str( float( filter(lambda ch: ch in '0123456789.~', item['productPrice'])) * 100) except: pass try: item['productImagePath'] = HtmlResponses.xpath( "//img[@id='zoomimg']/@src").extract()[0].encode( 'utf-8').replace("\"", 
"\'").strip() except: pass try: item['productAddres'] = "" except: pass try: item['productCompany'] = "" except: pass names = self.name + '.json' try: item['fileName'] = names except: pass list_speci_tr = HtmlResponses.xpath( "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/b/text()" ).extract() list_speci_th = HtmlResponses.xpath( "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/p/text()" ).extract() list_detail = HtmlResponses.xpath( "//div[@id='prd-desc-mdeditor']/p/text()").extract() driver.close() num_one = 0 for value_speci in list_speci_tr: data2 = {} value_speci = value_speci.encode('utf-8').replace( ":", "\/").replace("\n", "").replace("\"", "").strip() if "型号" in value_speci: item['productModel'] = list_speci_th[num_one] num_one += 1 continue if "品牌" in value_speci: item['productBrand'] = list_speci_th[num_one] num_one += 1 continue data2['attrkey'] = value_speci data2['keyname'] = list_speci_th[num_one] speci_list.append(data2) num_one += 1 #11.28 brand = '' try: brand = HtmlResponses.xpath( "//div[@class='form-group margin-left_53 margin-bottom-0'][1]/div/p/font/text()" ).extract()[0].encode('utf-8').replace("\t", "").replace( "\n", "").replace("\b", "").strip() if '/' in brand and item['productBrand'] == '': item['productBrand'] = brand.split('/')[0] else: item['productBrand'] = brand except: pass model = '' try: model = HtmlResponses.xpath( "//div[@class='form-group margin-left_53 margin-bottom-0'][1]/div/p/font/text()" ).extract()[0].encode('utf-8').replace("\t", "").replace( "\n", "").replace("\b", "").strip() if '/' in brand and item['productModel'] == '': item['productModel'] = model.split('/')[1] except: pass #11.28 detail = ' ' for value_detail in list_detail: value_detail = value_detail.encode('utf-8').replace( "\t", "").replace("\n", "").replace("\b", "").replace("<br>", "").replace("</br>", "").strip() detail += value_detail details_list.append(detail) item['productSpeci'] = speci_list item['productPack'] = pack_list 
item['productIntro'] = intro_list item['productDetails'] = details_list yield item
def parse(self, response):
    """Parse an mro.abiz.com product page into a BaseItem.

    Pure-XPath extraction on the plain response (the PhantomJS rendering
    used by sibling spiders is not needed here; the old commented-out
    driver code has been removed).  Extraction is best-effort: a failed
    lookup leaves the corresponding field empty.  Yields one item.
    """
    item = BaseItem()
    speci_list = []
    pack_list = []  # no packaging data on this site; stays empty
    intro_list = []
    details_list = []
    # Pre-fill every field so downstream pipelines always see the keys.
    for field in ('productUrl', 'productName', 'productBrand',
                  'productModel', 'productClassification', 'productPrice',
                  'productImagePath', 'productAddres', 'productCompany',
                  'fileName', 'productDetails', 'productPack',
                  'productIntro', 'productSpeci'):
        item[field] = ''
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        item['productUrl'] = response.url
    except Exception:
        pass
    try:
        item['productName'] = response.xpath(
            "//span[@id='productMainName']/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    try:
        classification_one = response.xpath(
            "//div[@class='bread-crumb']/a[2]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath(
            "//div[@class='bread-crumb']/a[3]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='bread-crumb']/a[4]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    item['productClassification'] = (classification_one + '|||' +
                                     classification_two + '|||' +
                                     classification_three)
    try:
        item['productPrice'] = response.xpath(
            "//strong[@class='prodet']/b/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    try:
        # Keep only digits / '.' / '~', then convert yuan to fen (x100).
        item['productPrice'] = str(
            float(filter(lambda ch: ch in '0123456789.~',
                         item['productPrice'])) * 100)
    except Exception:
        pass
    try:
        imagePath = response.xpath(
            "//img[@id='productImg']/@src").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        item['productImagePath'] = 'http://mro.abiz.com' + imagePath
    except Exception:
        pass
    # No address/company on this site; keep the empty defaults.
    item['productAddres'] = ""
    item['productCompany'] = ""
    item['fileName'] = self.name + '.json'
    list_speci_tr = response.xpath(
        "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/b/text()"
    ).extract()
    list_speci_th = response.xpath(
        "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/p/text()"
    ).extract()
    list_detail = response.xpath(
        "//div[@id='tbc_11']/div[@class='intro_box']").extract()
    logging.info("----------list_detail_len=%i" % len(list_detail))
    list_intro = response.xpath(
        "//div[@id='tbc_13']/div[@class='intro_box']").extract()
    logging.info("----------list_intro_len=%i" % len(list_intro))
    # Attribute names/values are parallel lists; brand/model rows go into
    # their own item fields rather than the speci list.
    num_one = 0
    for value_speci in list_speci_tr:
        data2 = {}
        value_speci = value_speci.encode('utf-8').replace(
            ":", "\/").replace("\n", "").replace("\"", "").strip()
        if "型号" in value_speci:
            item['productModel'] = list_speci_th[num_one]
            num_one += 1
            continue
        if "品牌" in value_speci:
            item['productBrand'] = list_speci_th[num_one]
            num_one += 1
            continue
        data2['attrkey'] = value_speci
        data2['keyname'] = list_speci_th[num_one]
        speci_list.append(data2)
        num_one += 1
    # 11.28: dedicated brand/model blocks override the table values.
    try:
        item['productBrand'] = response.xpath(
            "//dl[@class='pro-info-prop pro-info-brand']/dd[@class='pro-info-cons']/text()"
        ).extract()[0].encode('utf-8').replace("\t", "").replace(
            "\n", "").replace("\b", "").replace("\r", "").strip()
    except Exception:
        pass
    try:
        item['productModel'] = response.xpath(
            "//dl[@class='pro-info-prop pro-info-model']/dd[@class='pro-info-cons']/text()"
        ).extract()[0].encode('utf-8').replace("\t", "").replace(
            "\n", "").replace("\b", "").strip()
    except Exception:
        pass
    # 11.28
    tag_re = re.compile(r'<[^>]+>', re.S)  # strips any HTML tag (hoisted)
    for value_detail in list_detail:
        value_detail = value_detail.encode('utf-8').replace(
            "\t", "").replace("\n", "").replace("\b", "").replace(
                "<br>", "").replace("</br>", "").replace("\r", "").strip()
        details_list.append(tag_re.sub('', value_detail))
    # Boilerplate text ("link-detail") to strip from the intro sections.
    cancel = ''
    try:
        cancel_l = response.xpath(
            "//p[@class='link-detail']/text()").extract()
        for cancel_s in cancel_l:
            cancel += cancel_s.encode('utf-8').replace("\t", "").replace(
                "\n", "").replace("\b", "").replace("<br>", "").replace(
                    "</br>", "").replace("\r", "").strip()
    except Exception:
        pass
    for value_intro in list_intro:
        value_intro = value_intro.encode('utf-8').replace(
            "\t", "").replace("\n", "").replace("\b", "").replace(
                "<br>", "").replace("</br>", "").replace("\r", "").strip()
        # BUG FIX: the original called value_intro.replace(cancel, '') and
        # discarded the result (str.replace returns a new string), so the
        # boilerplate was never actually removed.
        if cancel:
            value_intro = value_intro.replace(cancel, '')
        intro_list.append(tag_re.sub('', value_intro))
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Parse a btone-mro.com product page into a BaseItem.

    Structured key/value attributes go into the item; the long-form
    intro/pack/speci paragraphs are additionally appended to a per-spider
    text dump under data/.  Extraction is best-effort: a failed lookup
    leaves the corresponding field empty.  Yields one item.
    """
    item = BaseItem()
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-fill every field so downstream pipelines always see the keys.
    for field in ('productUrl', 'productName', 'productBrand',
                  'productModel', 'productClassification', 'productPrice',
                  'productImagePath', 'productAddres', 'productCompany',
                  'fileName', 'productDetails', 'productPack',
                  'productIntro', 'productSpeci'):
        item[field] = ''
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        classification_one = response.xpath(
            "//div[@class='subNav']/a[3]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath(
            "//div[@class='subNav']/a[4]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='subNav']/a[5]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    classification = (classification_one + '|||' + classification_two +
                      '|||' + classification_three)
    try:
        item['productUrl'] = response.url
    except Exception:
        pass
    try:
        item['productName'] = response.xpath(
            "//div[@class='proDiv']/dl[@class='proDl']/dt/b/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    try:
        item['productBrand'] = response.xpath(
            "//div[@class='proDiv']/dl[@class='proDl']/dd[3]/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        item['productBrand'] = item['productBrand'].replace('产品品牌:', '')
    except Exception:
        pass
    try:
        item['productModel'] = response.xpath(
            "//div[@class='proDiv']/dl[@class='proDl']/dd[2]/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        item['productModel'] = item['productModel'].replace('原始型号:', '')
    except Exception:
        pass
    item['productClassification'] = classification
    try:
        # Strip whitespace, drop the CNY label, convert yuan to fen below.
        item['productPrice'] = response.xpath(
            "//div[@class='proDiv']/dl[@class='proDl']/dd[4]/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        item['productPrice'] = item['productPrice'].replace('价格:', '')
        if item['productPrice'] == '询价':
            # "inquiry" placeholder -> no published price
            item['productPrice'] = 0.0
    except Exception:
        pass
    try:
        # Keep only digits / '.' / '~', then convert yuan to fen (x100).
        item['productPrice'] = str(
            float(filter(lambda ch: ch in '0123456789.~',
                         item['productPrice'])) * 100)
    except Exception:
        pass
    # Image link
    try:
        item['productImagePath'] = "http://www.btone-mro.com" + response.xpath(
            "//img[@id='ctl00_ContentPlaceHolder1_imgMain']/@src"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    try:
        item['productAddres'] = response.xpath(
            "//form[@id='form1']/ul/li[4]/text()").extract()[0]
    except Exception:
        pass
    item['productCompany'] = ""
    item['fileName'] = self.name + '.json'
    list_details = response.xpath(
        "//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
    logging.info("-------list_details_len=%i" % len(list_details))
    list_pack = response.xpath(
        "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
    ).extract()
    intro = response.xpath("//span[@id='PDescription']/text()").extract()
    logging.info("-------intr_len=%i" % len(intro))
    speci = response.xpath("//span[@id='techParam']/text()").extract()
    logging.info("-------intr_len=%i" % len(speci))
    # list_details alternates key, value, key, value...  num_one tracks
    # parity (odd = key); it is set to 0 so the value that follows a
    # skipped brand/model key is skipped as well.
    # NOTE: the original also had "if num_one % 2 == 2", which can never
    # be true (x % 2 is 0 or 1); that dead branch is removed.
    num_one = 1
    for value_details in list_details:
        value_details = value_details.encode('utf-8').replace(
            ":", "\/").replace("\n", "").replace("\"", "").strip()
        if num_one % 2 == 1:
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in value_details:
                num_one = 0
                continue
            if '型号' in value_details:
                num_one = 0
                continue
            data2['attrkey'] = value_details
        else:
            if num_one == 0:
                num_one = 1
                continue
            data2['keyname'] = value_details
            details_list.append(data2)
        num_one += 1
    # Intro/speci text lines are "key:value" pairs separated by a
    # fullwidth colon; num_two/num_three track key/value parity.
    num_two = 1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro:
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey'] = value_intro
            else:
                data2['keyname'] = value_intro
                intro_list.append(data2)
            num_two += 1
    num_three = 1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci:
            if num_three % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey'] = value_speci
            else:
                data2['keyname'] = value_speci
                speci_list.append(data2)
            num_three += 1
    product_intro = response.xpath(
        "//div[@class='proNavInfo proNav1']/p[2]/text()").extract()
    product_pack = response.xpath(
        "//div[@class='proNavInfo proNav3']/p/text()").extract()
    product_speci = response.xpath(
        "//div[@class='proNavInfo proNav2']/p/text()").extract()
    filename = self.name + ".txt"
    # Append the long-form sections to the per-spider text dump; the
    # context manager replaces the original open()/close() pair (which
    # also shadowed the py2 'file' builtin).
    with open("data/" + filename, 'a+') as dump:
        dump.write("\n" + "productUrl:" + response.url + "\n")
        dump.write("productIntro:" + "\n")
        for intro_line in product_intro:
            dump.write(intro_line.encode('utf-8').replace("\b", "").replace(
                "<br/>", "").replace("<br>", "").strip() + "\n")
        dump.write("productPack:" + "\n")
        for pack_line in product_pack:
            dump.write(pack_line.encode('utf-8').replace("\"", "").strip()
                       + "\n")
        dump.write("productSpeci:" + "\n")
        for speci_line in product_speci:
            dump.write(speci_line.encode('utf-8').replace("\"", "").strip()
                       + "\n")
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    # BUG FIX: the original assigned intro_list here, leaving the
    # details_list built above completely unused; every sibling spider
    # assigns details_list to productDetails.
    item['productDetails'] = details_list
    yield item
def parse(self, response):
    """Parse a 1ez.com.cn product page into a BaseItem.

    Pure-XPath extraction on the plain response.  Attribute rows are
    re-queried one <li> at a time by 1-based index.  Extraction is
    best-effort: a failed lookup leaves the field empty.  Yields one item.
    """
    item = BaseItem()
    speci_list = []
    pack_list = []
    intro_list = []
    details_list = []
    # Pre-fill every field so downstream pipelines always see the keys.
    for field in ('productUrl', 'productName', 'productBrand',
                  'productModel', 'productClassification', 'productPrice',
                  'productImagePath', 'productAddres', 'productCompany',
                  'fileName', 'productDetails', 'productPack',
                  'productIntro', 'productSpeci'):
        item[field] = ''
    classification_one = ''
    classification_two = ''
    classification_three = ''
    try:
        item['productUrl'] = response.url
    except Exception:
        pass
    try:
        item['productName'] = response.xpath(
            "//div[@class='center_title']/a/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    try:
        item['productBrand'] = response.xpath(
            "//form[@id='ECS_FORMBUY_P']/div[@class='detail_center']/div[@class='center_txt']/div[@class='center_text']/p[1]/a/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    try:
        item['productModel'] = response.xpath(
            '//div[@class="m m1"]/div/ul/dt/li/text()').extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    try:
        classification_one = response.xpath(
            "//div[@class='location']/a[2]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_two = response.xpath(
            "//div[@class='location']/a[3]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
        classification_three = response.xpath(
            "//div[@class='location']/a[4]/text()").extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    item['productClassification'] = (classification_one + '|||' +
                                     classification_two + '|||' +
                                     classification_three)
    try:
        item['productPrice'] = response.xpath(
            "//div[@class='center_text']/ul[@class='tm-fcs-panel']/li[1]/span[@id='ECS_GOODS_AMOUNT']/span/text()"
        ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    try:
        # Keep only digits / '.' / '~', then convert yuan to fen (x100).
        item['productPrice'] = str(
            float(filter(lambda ch: ch in '0123456789.~',
                         item['productPrice'])) * 100)
    except Exception:
        pass
    try:
        item['productImagePath'] = "http://www.1ez.com.cn/" + response.xpath(
            '//img[@id="J_prodImg"]/@src').extract()[0].encode(
                'utf-8').replace("\"", "\'").strip()
    except Exception:
        pass
    # No address/company on this site; keep the empty defaults.
    item['productAddres'] = ""
    item['productCompany'] = ""
    item['fileName'] = self.name + '.json'
    list_details = response.xpath(
        "//ul[@class='inLeft_attributes']/li/text()").extract()
    details = response.xpath(
        "//ul[@class='inLeft_attributes']/li/span/text()").extract()
    logging.info("-------list_details_len=%i" % len(list_details))
    logging.info("-------details_len=%i" % len(details))
    list_pack = response.xpath(
        "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
    ).extract()
    intro = response.xpath("//span[@id='PDescription']/text()").extract()
    logging.info("-------intr_len=%i" % len(intro))
    speci = response.xpath("//span[@id='techParam']/text()").extract()
    logging.info("-------intr_len=%i" % len(speci))
    num_one = 0
    # BUG FIX: XPath li[] indices are 1-based, so the loop must run up to
    # len(list_details) inclusive; the original range(1, len(...)) always
    # dropped the last attribute row.  A failed lookup still breaks out.
    for j in range(1, len(list_details) + 1):
        try:
            value_details = response.xpath(
                "//ul[@class='inLeft_attributes']/li[%i]/text()" % j
            ).extract()[0].encode('utf-8').replace(":", "").replace(
                "\"", "").replace(":", "").strip()
        except Exception:
            break
        # Brand/name/model rows are not attribute pairs; skip the key but
        # keep the value index in step.
        if '品牌' in value_details:
            num_one += 1
            continue
        if '商品名称' in value_details:
            num_one += 1
            continue
        if '型号' in value_details:
            num_one += 1
            continue
        data2 = {}
        data2['attrkey'] = value_details
        data2['keyname'] = details[num_one]
        details_list.append(data2)
        num_one += 1
    # Intro/speci lines are "key:value" pairs on a fullwidth colon;
    # num_two/num_three track key/value parity.
    num_two = 1
    for list_intro in intro:
        list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_intro = list_intro.split(':')
        for value_intro in list_intro:
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_intro:
                    break
                if '品牌' in value_intro:
                    break
                data2['attrkey'] = value_intro
            else:
                data2['keyname'] = value_intro
                intro_list.append(data2)
            num_two += 1
    num_three = 1
    for list_speci in speci:
        list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
            "\"", "").strip()
        list_speci = list_speci.split(':')
        for value_speci in list_speci:
            if num_three % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '商品名称' in value_speci:
                    break
                if '品牌' in value_speci:
                    break
                data2['attrkey'] = value_speci
            else:
                data2['keyname'] = value_speci
                speci_list.append(data2)
            num_three += 1
    item['productSpeci'] = speci_list
    item['productPack'] = pack_list
    item['productIntro'] = intro_list
    item['productDetails'] = details_list
    yield item
def parse(self, response): item = BaseItem() speci_list = [] pack_list = [] intro_list = [] details_list = [] item['productUrl'] = '' item['productName'] = '' item['productBrand'] = '' item['productModel'] = '' item['productClassification'] = '' item['productPrice'] = '' item['productImagePath'] = '' item['productAddres'] = "" item['productCompany'] = '' item['fileName'] = '' item['productDetails'] = "" item['productPack'] = "" item['productIntro'] = "" item['productSpeci'] = "" classification_one = '' classification_two = '' classification_three = '' try: classification_one = response.xpath( "//div[@class='siteUrl']/a[2]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() classification_two = response.xpath( "//div[@class='siteUrl']/a[3]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() classification_three = response.xpath( "//div[@class='siteUrl']/a[4]/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass classification = classification_one + '|||' + classification_two + '|||' + classification_three print "PhantomJS is starting1..." 
driver = webdriver.PhantomJS() driver.get(response.url) #time.sleep(3) body = driver.page_source #driver.close() HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response) try: item['productUrl'] = response.url except: pass try: item['productName'] = response.xpath( "//div[@class='hd']/div/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass try: item['productBrand'] = response.xpath( "//div[@class='dd']/em/a/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass try: item['productModel'] = response.xpath( "//form[@id='ECS_FORMBUY']/div[@class='proInfo f_R']/div[@class='bd']/ul/li[2]/div[@class='dd']/em/text()" ).extract()[0].encode('utf-8').replace("\"", "\'").strip() except: pass try: item['productClassification'] = classification except: pass try: #去空格 转分 去人民币符号 item['productPrice'] = HtmlResponses.xpath( "//b[@id='ECS_GOODS_AMOUNT']/text()").extract()[0].encode( 'utf-8').replace("\"", "\'").strip() except: pass try: item['productPrice'] = str( float( filter(lambda ch: ch in '0123456789.~', item['productPrice'])) * 100) except: pass #图片连接 try: item['productImagePath'] = "http://www.huaaomro.com/" + HtmlResponses.xpath( '//div[@class="proSide f_L"]/div[@class="bd"]/img[@id="idImage2"]/@src' ).extract()[0].encode('utf-8').replace("\"", "\'").strip() except: pass #print item['image_urls'],"777777" try: item['productAddres'] = response.xpath( "//div[@id='pdetail']/div[@class='d-vopy']/table/tbody/tr[4]/td/text()" ).extract()[0] except: pass try: item['productCompany'] = "" except: pass names = self.name + '.json' try: item['fileName'] = names except: pass item['productAddres'] = "" item['productCompany'] = "" names = self.name + '.json' item['fileName'] = names list_details = response.xpath( "//div[@class='d-vopy']/table/tr/th/h4/text()").extract() logging.info("-------list_details_len=%i" % len(list_details)) details = response.xpath( 
"//div[@class='d-vopy']/table/tr/td/text()").extract() logging.info("-------details_len=%i" % len(details)) list_pack = response.xpath( "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()" ).extract() list_intro = response.xpath( "//ul[@class='detail-list clearfix']/li/text()").extract() logging.info("-------list_intro_len=%i" % len(list_intro)) intro = response.xpath( "//div[@class='goods']/table[@class='goods-items']/tr[2]/td/text()" ).extract() logging.info("-------intr_len=%i" % len(intro)) speci = response.xpath("//span[@id='techParam']/text()").extract() logging.info("-------intr_len=%i" % len(speci)) num_one = 0 for list_details_value in list_details: list_details_value = list_details_value.encode('utf-8').replace( "\n", "").replace("\"", "").strip() data2 = {} data2['attrkey'] = '' data2['keyname'] = '' if '品牌' in list_details_value: num_one += 1 continue if '价格' in list_details_value: num_one += 1 continue if '供应商' in list_details_value: num_one = 0 continue if '保修期' in list_details_value: break data2['attrkey'] = list_details_value data2['keyname'] = details[num_one] details_list.append(data2) num_one += 1 for list_intro_value in list_intro: list_intro_value = list_intro_value.encode('utf-8').replace( "\n", "").strip() intro = list_intro_value.split(':') data2 = {} data2['attrkey'] = '' data2['keyname'] = '' if '商品品牌' in intro[0]: continue if '商品型号' in intro[0]: continue if '商品名称' in intro[0]: continue data2['attrkey'] = intro[0] data2['keyname'] = intro[1] intro_list.append(data2) num_three = 1 for list_speci in speci: list_speci = list_speci.encode('utf-8').replace("\n", "").replace( "\"", "").strip() list_speci = list_speci.split(':') for value_speci in list_speci: if num_three % 2 == 1: data2 = {} data2['attrkey'] = '' data2['keyname'] = '' if '商品名称' in value_speci: break if '品牌' in value_speci: break data2['attrkey'] = value_speci else: data2['keyname'] = 
value_speci speci_list.append(data2) num_three += 1 item['productSpeci'] = speci_list item['productPack'] = pack_list item['productIntro'] = intro_list item['productDetails'] = details_list yield item