def parse_yhd_item(self, reponse):
    """Parse a yhd product page and yield the populated item.

    The item carried in ``reponse.meta['item']`` always receives ``title``,
    ``brand`` and ``source``. ``source`` is 1 when the page's provider
    paragraph starts with the self-operated marker text, otherwise 0;
    ``product_name`` is only filled in the self-operated case.

    :param reponse: downloaded page; ``body`` and ``meta['item']`` are read.
    :raises AttributeError: when the page has no ``h1#productMainName`` node.
    """
    soup = BeautifulSoup(reponse.body, "html5lib")
    item = reponse.meta['item']
    item['title'] = soup.find('h1', id="productMainName").text

    # A page without the provider paragraph cannot carry the self-operated
    # marker, so it is handled as a third-party listing instead of crashing
    # on ``None.text``.
    source_tag = soup.find('p', attrs={'class': 'add_02'})
    is_proprietary_trading = (
        source_tag is not None
        and source_tag.text.strip().startswith(u'本商品由1号店自营提供')
    )

    if not is_proprietary_trading:
        good_tag = soup.find('dl', class_="des_info clearfix")
        item['brand'] = self.get_brand(good_tag)
        item['source'] = 0
        yield item
        return

    ul_tag = soup.find('ul', attrs={'class': 'ull'})

    # This method treats the string 'None' from get_parse_value as
    # "label not found" and moves on to the next candidate label.
    for brand_label in (u'【产品品牌】:', u'【品牌名称】:'):
        brand_name = Parse_Util.get_parse_value(ul_tag, brand_label)
        if brand_name != 'None':
            break
    else:
        # Neither label matched: fall back to the description block.
        dl_tag = soup.find('dl', attrs={'class': 'des_info clearfix'})
        brand_name = self.get_brand(dl_tag)

    # No further fallback here: if every label misses, the last lookup
    # result (the 'None' marker) is stored as the product name.
    for name_label in (u'【产品名称】:', u'【商品名称】:', u'【名称】:'):
        product_name = Parse_Util.get_parse_value(ul_tag, name_label)
        if product_name != 'None':
            break

    item['brand'] = brand_name
    item['product_name'] = product_name
    item['source'] = 1
    yield item
def parse_kaola_item(self, reponse):
    """Parse a Kaola product page and yield the item with its details.

    A ``KaoLaMMItem`` holding ``title``, ``brand``, ``product_name`` and
    ``good_detail`` is stored under ``item['other_parameter']`` of the item
    carried in ``reponse.meta['item']``.

    :param reponse: downloaded page; ``body`` and ``meta['item']`` are read.
    """
    page = BeautifulSoup(reponse.body, "html5lib")
    item = reponse.meta['item']
    detail = KaoLaMMItem()

    # Title comes from the <dt class="product-title"> node.
    detail['title'] = page.find('dt', class_="product-title").text

    # Brand, product name and the detail mapping are all derived from the
    # same parameter list.
    params_tag = page.find('ul', class_='goods_parameter')
    detail['brand'] = Parse_Util.get_parse_value(params_tag, u'商品品牌:')
    detail['product_name'] = Parse_Util.get_parse_value(params_tag, u'品名:')
    detail['good_detail'] = Parse_Util.make_up_dic(params_tag)

    item['other_parameter'] = detail
    yield item
def parse_word_wide_item(self, response):
    """Parse a "global purchase" item page, then request its comment page.

    A ``JDMMItem`` with ``title``, ``product_name``, ``brand`` and
    ``good_detail`` is stored under ``item['other_parameter']``. The item is
    not yielded here; it is passed on in the meta of a request handled by
    ``self.parse_comment_detail``.

    :param response: downloaded page; ``body``, ``meta['item']`` and
        ``meta['id']`` are read.
    """
    page = BeautifulSoup(response.body, "html5lib")
    item = response.meta['item']
    item_id = response.meta['id']

    name_tag = page.find('div', id="name")
    detail = JDMMItem()
    # The title is whatever delete_node_content returns for the name node
    # and the 'span' argument.
    detail['title'] = self.delete_node_content(name_tag, 'span')

    params_tag = page.find('ul', id="parameter2")
    detail['product_name'] = Parse_Util.get_parse_value(
        params_tag, u'商品名称:')
    detail['brand'] = Parse_Util.get_parse_value(params_tag, u'品牌:')
    detail['good_detail'] = Parse_Util.make_up_dic(params_tag)
    item['other_parameter'] = detail

    # The comment URL template takes the numeric item id.
    comment_url = comment_origin_url % int(item_id)
    yield Request(
        comment_url,
        callback=self.parse_comment_detail,
        meta={'item': item},
    )
def parse_jd_item(self, response):
    """Parse a regular JD item page, then request its comment page.

    A ``JDMMItem`` with ``title``, ``product_name``, ``good_detail`` and
    ``brand`` is stored under ``item['other_parameter']``. The item is not
    yielded here; it is passed on in the meta of a request handled by
    ``self.parse_comment_detail``.

    The title falls back to ``'error'`` when the ``div#name`` node or its
    direct ``<h1>`` child is missing. The brand stays ``'None'`` when the
    ``ul#parameter-brand`` list or its first ``<li>`` is missing.

    :param response: downloaded page; ``body``, ``meta['item']`` and
        ``meta['id']`` are read.
    """
    soup = BeautifulSoup(response.body, "html5lib")
    item = response.meta['item']
    item_id = response.meta['id']

    title = 'error'
    title_tag = soup.find('div', id="name")
    if title_tag is not None:
        # Only a direct <h1> child counts, matching a scan of the children.
        h1_tag = title_tag.find('h1', recursive=False)
        if h1_tag is not None:
            # .string is None when the <h1> wraps nested markup; use the
            # concatenated text then, so .encode() below has a string.
            title = h1_tag.string
            if title is None:
                title = h1_tag.get_text()

    jd_item = JDMMItem()
    jd_item['title'] = title.encode('utf-8')

    good_tag = soup.find('ul', attrs={'id': 'parameter2'})
    jd_item['product_name'] = Parse_Util.get_parse_value(
        good_tag, u'商品名称:')
    jd_item['good_detail'] = Parse_Util.make_up_dic(good_tag)

    jd_item['brand'] = 'None'
    ul_tag = soup.find('ul', id="parameter-brand")
    # The first <li> supplies both the brand (its title attribute) and the
    # "label:value" text copied into good_detail.
    li_tag = ul_tag.find('li') if ul_tag is not None else None
    if li_tag is not None:
        jd_item['brand'] = li_tag.get("title")

        brand_str = re.sub(r'\s+', '', li_tag.text)
        # Drop everything from the heart marker onwards; a no-op when the
        # marker is absent.
        brand_str = brand_str.split(u'♥')[0]
        brand_parts = brand_str.split(u':')
        # Without a ':' there is no label/value pair to record.
        if len(brand_parts) > 1:
            jd_item['good_detail'][brand_parts[0]] = brand_parts[1]

    item['other_parameter'] = jd_item

    item_comment_link = comment_origin_url % int(item_id)
    yield Request(
        item_comment_link,
        callback=self.parse_comment_detail,
        meta={'item': item},
    )