def parse(self, response):
    # str(response) looks like "<200 https://...>"; the original sliced the URL
    # back out of that repr, which is simply response.url.
    bro = self.login(response.url)
    num = 0
    for i in range(2):
        # Taobao search pages are offset by 44 items per page (the `s` parameter).
        url = "https://s.taobao.com/search?q=java&s=" + str(num)
        num += 44
        bro.get(url)
        html = bro.page_source
        soup = BeautifulSoup(html, 'lxml')
        data_list = soup.find_all(class_='item J_MouserOnverReq')
        for data in data_list:
            # Re-parse the result card so searches stay scoped to this item.
            data_soup = BeautifulSoup(str(data), 'lxml')
            # Image link
            img_url = "http:" + data_soup.find(class_='J_ItemPic img')['data-src']
            # Price
            price = data_soup.find('strong').string
            # Title
            title = data_soup.find(class_='J_ItemPic img')['alt']
            # Detail page
            detail_url = "https:" + data_soup.find(
                class_="pic-link J_ClickStat J_ItemPicA")["data-href"]
            bro.get(detail_url)
            time.sleep(1)
            html_second = bro.page_source
            soup = BeautifulSoup(html_second, 'lxml')
            # The three counters below only exist on Tmall detail pages,
            # so fall back to 0 when the element is missing.
            try:
                # Monthly sales; strip the "月销量" label.
                svolume = soup.find(
                    class_="tm-ind-item tm-ind-sellCount").text.replace("月销量", "")
            except AttributeError:
                svolume = 0
            try:
                # Accumulated reviews; strip the "累计评价" label.
                evaluate = soup.find(
                    class_="tm-ind-item tm-ind-reviewCount canClick tm-line3"
                ).text.replace("累计评价", "")
            except AttributeError:
                evaluate = 0
            try:
                # Tmall bonus points; strip the "送天猫积分" label.
                integral = soup.find(
                    class_="tm-ind-item tm-ind-emPointCount").text.replace("送天猫积分", "")
            except AttributeError:
                integral = 0
            item = TaobaoItem(img_url=img_url, price=price, title=title,
                              svolume=svolume, evaluate=evaluate,
                              integral=integral, detail_url=detail_url)
            yield item
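# `self.login` is not shown in this snippet. A minimal sketch, assuming it
# starts a Selenium Chrome session, opens the passed URL, and leaves time for
# a manual login before returning the driver (the wait length and the use of
# Chrome are assumptions, not taken from the original):
def login(self, url):
    from selenium import webdriver
    bro = webdriver.Chrome()
    bro.get(url)
    time.sleep(20)  # assumed: time to complete the login / QR scan by hand
    return bro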
def loop_rate_parse(self, response):
    # The rate API returns JSONP; strip the callback wrapper to get the JSON body.
    body = json.loads(re.findall(r'\((.*)\)', response.text)[0])
    for comment in body['comments']:
        # Build a fresh item per comment; reusing one instance would keep
        # overwriting the same object.
        item = TaobaoItem()
        item['id'] = comment['rateId']
        item['url'] = response.url
        item['platform'] = '淘宝'
        item['viewType'] = '问答'
        item['searchWord'] = response.meta['sw']
        item['crawlTime'] = self.get_localtime()
        item['publishTime'] = comment['date']
        item['level'] = 1
        item['authorName'] = comment['user']['nick']
        item['content'] = comment['content']
        print(item)  # debug output
        yield item   # hand the item to the item pipelines as well
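# `self.get_localtime` is not shown; a minimal sketch, assuming it just
# formats the crawl time as a plain timestamp string (the exact format is an
# assumption):
def get_localtime(self):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())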
def parse_item(self, response):
    # The search results are embedded in the page source as a JSON blob
    # (g_page_config); extract it with a regex.
    pattern = re.compile('g_page_config = ({.*?});', re.S)
    json_data = re.search(pattern, response.text).group(1)
    json_l = json.loads(json_data)
    # Total number of result pages, read from the same page source.
    page = re.findall(r'"totalPage":(\d+)', response.text)
    totalPage = int(page[0])
    for datas in json_l.get('mods').get('itemlist').get('data').get('auctions'):
        item = TaobaoItem()
        item['item_loc'] = datas.get('item_loc')
        item['pic_url'] = 'https:' + datas.get('pic_url')
        item['raw_title'] = datas.get('raw_title')
        # The JSON key is 'shopLink'; the original 'shop_Link' always missed.
        shop_link = datas.get('shopLink')
        item['shop_link'] = 'https:' + shop_link if shop_link else None
        item['view_price'] = datas.get('view_price')
        item['view_sales'] = datas.get('view_sales')
        field_names = [
            'item_loc', 'pic_url', 'raw_title', 'view_price',
            'view_sales', 'shop_link'
        ]
        for name in field_names:
            if item[name] is None:
                item[name] = 'there is no item data'
        yield item
    # Follow the next result page. parse_item is its own callback, so each page
    # schedules the one after it; offsets run 0 .. (totalPage - 1) * 44.
    if response.meta['data_value'] < totalPage * 44:
        pager = json_l.get('mainInfo').get('modLinks').get('pager')
        print('Finished crawling one page')  # debug
        # The pager link needs the page offset and a fresh timestamp appended.
        times = self.get_time_stamp()
        other_data = {
            'data-key': 's',
            'data-value': str(response.meta['data_value']),
            '_ksTS': times[0]
        }
        next_url = 'https:' + pager + '&' + parse.urlencode(other_data)
        data_values = response.meta['data_value'] + 44
        yield Request(url=next_url,
                      meta={'data_value': data_values},
                      cookies=self.cookie,
                      callback=self.parse_item)
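# `self.get_time_stamp` is not shown; a minimal sketch, assuming it builds the
# `_ksTS` value Taobao's pager expects: epoch milliseconds plus a short random
# suffix, in the shape '1511058139704_756' seen in the shop-search URL below.
# Returning a tuple matches the `times[0]` access above.
def get_time_stamp(self):
    import random
    millis = int(time.time() * 1000)
    suffix = random.randint(100, 999)
    return ('%d_%d' % (millis, suffix),)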
def parse(self, response):
    # First results page: same g_page_config extraction as parse_item.
    pattern = re.compile('g_page_config = ({.*?});', re.S)
    json_data = re.search(pattern, response.text).group(1)
    json_l = json.loads(json_data)
    for datas in json_l.get('mods').get('itemlist').get('data').get('auctions'):
        # Create a fresh item per auction; the original reused one instance
        # and kept mutating it between yields.
        item = TaobaoItem()
        item['item_loc'] = datas.get('item_loc')
        item['pic_url'] = 'https:' + datas.get('pic_url')
        item['raw_title'] = datas.get('raw_title')
        item['shop_link'] = 'https:' + datas.get('shopLink')
        item['view_price'] = datas.get('view_price')
        item['view_sales'] = datas.get('view_sales')
        field_names = [
            'item_loc', 'pic_url', 'raw_title', 'view_price',
            'view_sales', 'shop_link'
        ]
        for name in field_names:
            if item[name] is None:
                item[name] = 'there is no item data'
        yield item
    # Build the second-page URL from the pager link plus the page offset and a
    # fresh timestamp.
    next_partial_url = json_l.get('mainInfo').get('modLinks').get('pager')
    times = self.get_time_stamp()
    data_value = 44
    other_data = {
        'data-key': 's',
        'data-value': str(data_value),
        '_ksTS': times[0],
    }
    next_url = 'https:' + next_partial_url + '&' + parse.urlencode(other_data)
    data_values = data_value + 44
    yield Request(url=next_url,
                  meta={'data_value': data_values},
                  cookies=self.cookie,
                  callback=self.parse_item)
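# A sketch of the TaobaoItem fields the two search callbacks above rely on,
# assuming a standard Scrapy item definition in items.py (the real items.py is
# not shown in this snippet):
import scrapy

class TaobaoItem(scrapy.Item):
    item_loc = scrapy.Field()    # seller location
    pic_url = scrapy.Field()     # product image URL
    raw_title = scrapy.Field()   # listing title
    shop_link = scrapy.Field()   # shop page URL
    view_price = scrapy.Field()  # displayed price
    view_sales = scrapy.Field()  # displayed sales count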
def parse(self, response):
    # Pull the 12 popular shop categories out of the page source.
    shop_type_name = re.findall(r'{"name":"(\S+?)"', response.text)
    # Create the item object; deepcopy below keeps each request's copy isolated.
    item = TaobaoItem()
    # The matches repeat; every 6th entry is a distinct category name.
    llist = [n * 6 for n in range(12)]
    for n in range(len(shop_type_name)):
        if n in llist:
            item['商店种类'] = shop_type_name[n]
            url = ('https://shopsearch.taobao.com/search?data-key=s&data-value=0'
                   '&ajax=true&_ksTS=1511058139704_756&app=shopsearch'
                   '&spm=a230r.7195193.0.0.9l0Tsy&q=%s' % item['商店种类'])
            # Note: time.sleep blocks Scrapy's reactor; DOWNLOAD_DELAY is the
            # idiomatic throttle.
            time.sleep(0.01)
            yield scrapy.Request(url,
                                 callback=self.parse_shop_type_detail,
                                 meta={'meta_1': copy.deepcopy(item)})
            time.sleep(1)
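# `parse_shop_type_detail` is not defined in this snippet; a hypothetical
# sketch of the callback shape, showing only what the meta wiring above
# implies. The JSON schema of the ajax shop-search response is not shown here,
# so the extraction step is left as a placeholder.
def parse_shop_type_detail(self, response):
    item = response.meta['meta_1']     # the item carrying '商店种类'
    data = json.loads(response.text)   # assumed: the ajax=true endpoint returns JSON
    # ... extract shop fields from `data` and copy them into `item` ...
    yield item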
def next(self, response):
    item = TaobaoItem()
    url = response.url
    # The subdomain tells Tmall and Taobao detail pages apart.
    pattam_url = 'https://(.*?).com'
    subdomain = re.compile(pattam_url).findall(url)
    if subdomain[0] != 'item.taobao':
        # Tmall detail page
        title = response.xpath("//div[@class='tb-detail-hd']/h1/text()").extract()[0]
        pattam_price = '"defaultItemPrice":"(.*?)"'
        price = re.compile(pattam_price).findall(
            response.body.decode('utf-8', 'ignore'))  # Tmall
        pattam_id = 'id=(.*?)&'
        itemID = re.compile(pattam_id).findall(url)[0]
        # salesVolume = response.xpath('//div[@class="tb-sell-counter"]/a/strong/text()').extract()[0]
        itemInfo = response.xpath('//div[@class="attributes"]/div/ul/li/text()').extract()
    else:
        # Taobao detail page
        title = response.xpath("//h3[@class='tb-main-title']/@data-title").extract()[0]
        price = response.xpath("//em[@class = 'tb-rmb-num']/text()").extract()[0]  # Taobao
        pattam_id = 'id=(.*?)$'
        itemID = re.compile(pattam_id).findall(url)[0]
        # salesVolume = response.xpath('//div[@class="tb-sell-counter"]/a/@title').extract()
        itemInfo = response.xpath('//div[@class="attributes"]/ul/li/text()').extract()

    # The review count is too much hassle here; reviews are scraped directly
    # elsewhere, so this is skipped.
    # # Build the URL of the endpoint that carries the review-count payload:
    # comment_url = 'https://dsr-rate.tmall.com/list_dsr_info.htm?itemId=' + str(itemID)
    # # Fetching this URL never errors: even with a wrong itemId the request
    # # still returns a page, so there is no need to wrap it in
    # # try / except urllib.error.URLError.
    # comment_data = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
    # pattam_comment = '"rateTotal":(.*?),"'
    # comment = re.compile(pattam_comment).findall(comment_data)

    item['title'] = title
    item['itemLink'] = response.url
    item['price'] = price
    item['itemID'] = itemID
    # item['salesVolume'] = salesVolume
    item['itemInfo'] = itemInfo
    yield item

    # Fields defined in items.py for reference: title (name), price,
    # salesVolume (monthly sales), comment (review count), itemInfo (item
    # details), itemLink, itemID.
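# The commented-out approach above, as a standalone sketch: fetch the Tmall
# rate-info endpoint for an item ID and pull out the total review count. The
# endpoint and the "rateTotal" key come from the comments above; error
# handling is kept minimal on the same reasoning (the URL always returns a
# page, even for a bad itemId).
import re
import urllib.request

def fetch_comment_count(item_id):
    comment_url = 'https://dsr-rate.tmall.com/list_dsr_info.htm?itemId=' + str(item_id)
    comment_data = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
    # Returns a list like ['123'] when the payload contains the count.
    return re.compile('"rateTotal":(.*?),"').findall(comment_data)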