def parse(self, response):
    XiaohuaSpider.count += 1
    pagenum = XiaohuaSpider.count
    # if a max page count was supplied, stop once the current page exceeds it
    if XiaohuaSpider.maxpage and pagenum > int(XiaohuaSpider.maxpage):
        print('page limit exceeded, stopping')
        return
    print('performing page %d' % pagenum)
    item = TiebaItem()  # instantiate the item
    page_item = dict()  # record the page number
    page_item['page_num'] = pagenum
    yield page_item  # hand the current page number to the pipeline
    # strip the comment markers and swap in the new body (bytes)
    newbody = bytes(response.text.replace("<!--", "").replace("-->", ""),
                    encoding='utf-8')
    # extract the data from the uncommented markup
    newresponse = response.replace(body=newbody)
    li_list = newresponse.xpath('//li[@class=" j_thread_list clearfix"]')
    # print(len(li_list))
    for el in li_list:
        item['title'] = el.xpath(".//a/text()").get()  # each result is a list
        item['link'] = newresponse.urljoin(el.xpath(".//a/@href").get())
        yield item
    # next page: keep requesting as long as a "next" link exists
    next_part_url = newresponse.xpath(
        '//*[@id="frs_list_pager"]/a[contains(text(),"下一页>")]/@href').get()
    if next_part_url is not None:
        next_url = 'https:' + next_part_url
        yield scrapy.Request(url=next_url, callback=self.parse)
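Every snippet in this section fills a TiebaItem whose items.py definition is not shown. A minimal sketch consistent with just the fields the spider above sets; the exact field list is an assumption, and other snippets clearly declare additional fields:

import scrapy

class TiebaItem(scrapy.Item):
    # hypothetical field list, reconstructed from the fields set above
    title = scrapy.Field()  # thread title text
    link = scrapy.Field()   # absolute thread URL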
def the_tiezi(self, one_thread):
    '''Takes the raw selector for one thread title on the forum's front page
    and returns a dict with the title, author and related fields.'''
    tiezi = TiebaItem()
    data = json.loads(one_thread.xpath("@data-field").extract_first())  # roughly an outline of the thread
    title = one_thread.xpath('.//a[@class="j_th_tit "]/text()').extract_first().strip()  # title
    author = data['author_name']  # original poster
    tid = data['id']  # the thread's tid
    reply_num = int(data['reply_num'])  # number of replies
    last_reply_time = one_thread.xpath(
        './/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()').extract_first()
    if last_reply_time is not None:  # time of the last reply (pinned threads have none)
        last_reply_time = last_reply_time.strip()
        if re.match(r'\d+:\d+', last_reply_time):
            # older threads show only a date; today's show only hour:minute, so prepend today's date
            last_reply_time = time.strftime("%Y-%m-%d ", time.localtime()) + last_reply_time
    # last replier (pinned threads appear not to have one)
    last_reply_author = one_thread.xpath(
        './/span[@class="tb_icon_author_rely j_replyer"]/@title').re_first(r'最后回复人: \s*(.*)')
    tiezi['title'] = title
    tiezi['author'] = author
    tiezi['tid'] = tid
    tiezi['pages'] = None
    tiezi['reply_num'] = int(reply_num)
    tiezi['last_reply_author'] = last_reply_author
    tiezi['last_reply_time'] = last_reply_time
    tiezi['post_list'] = []  # list that will hold the per-floor post info
    return dict(tiezi)
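A sketch of how the_tiezi might be driven from a list-page callback; the thread-node XPath is borrowed from the sibling snippets here, and the wrapper itself is a hypothetical illustration:

def parse(self, response):
    # iterate the thread <li> nodes and delegate per-thread parsing;
    # the_tiezi() returns a plain dict, which Scrapy accepts as an item
    for one_thread in response.xpath('//li[@class=" j_thread_list clearfix"]'):
        yield self.the_tiezi(one_thread)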
def parse_post_note(self, response):
    json_obj = json.loads(response.body)
    no = int(json_obj['no'])
    if no == 0:
        utils.debug('post succeeded:', json.dumps(json_obj['data']))
        tid = json_obj['data']['tid']
        meta = response.meta['datas']
        data = dict()
        data['fid'] = meta['fid']
        data['id'] = tid
        data['kw'] = meta['kw']
        data['tbs'] = meta['tbs']
        data['title'] = meta['title']
        data['content'] = meta['content']
        data['timestamp'] = utils.timestamp
        item = TiebaItem(type=1)
        item['note'] = data
        yield item
        time.sleep(2)
        yield self.post_reply(tid)
    else:
        err_code = int(json_obj['err_code'])
        utils.debug('post failed:', get_post_err_msg(no, err_code, response.body))
        if no == 40:  # a captcha is required
            vcode_obj = json_obj['data']['vcode']
            input_captcha = utils.show_captcha(vcode_obj['captcha_vcode_str'])
            captcha_type = vcode_obj['captcha_code_type']
            yield self.__check_captcha(captcha=input_captcha, captcha_type=captcha_type)
            yield self.post_note(input_captcha)
def parse(self, response):
    self.i = 1
    item = TiebaItem()
    li_list = response.xpath('//*[@id="thread_list"]/li')
    for li in li_list:
        item["tittle"] = li.xpath('.//a[@rel="noreferrer"]/@title').extract_first()
        item["url"] = li.xpath('.//a[@rel="noreferrer"]/@href').extract_first()
        item["content"] = li.xpath(
            './/div[contains(@class,"threadlist_abs threadlist_abs_onlyline ")]/text()'
        ).extract_first()
        # skip the javascript placeholder links
        if item["url"] != 'javascript:;':
            item["url"] = "https://tieba.baidu.com" + item["url"]
            yield scrapy.Request(item["url"],
                                 callback=self.parse_detail,
                                 meta={"item": copy.deepcopy(item)})
            print(item["url"])
    next_part = response.xpath(
        "//a[contains(@class,'next pagination-item')]/@href").extract_first()
    # extract_first() returns None on the last page, so guard before concatenating
    if next_part is not None:
        next_url = "https:" + next_part
        print(type(next_url))
        print(next_url)
        yield scrapy.Request(next_url, callback=self.parse)
def parse_my_forums(self, response):
    text = re.search(r'<table>(.*?)</table>', response.text).group(1)
    selector = Selector(text=text)
    trs = selector.xpath('//tr')
    if trs is not None and len(trs) > 1:
        trs = trs[1:]
        for tr in trs:
            if len(tr.xpath('td')) == 0:
                continue
            forum = dict()
            forum['title'] = tr.xpath('td[1]/a/@title').extract()[0]
            forum['href'] = tr.xpath('td[1]/a/@href').extract()[0]
            forum['exper'] = tr.xpath('td[2]/a/text()').extract()[0]
            forum['fid'] = tr.xpath('td[4]/span/@balvid').extract()[0]
            forum['tbs'] = tr.xpath('td[4]/span/@tbs').extract()[0]
            self.forums.append(forum)
            item = TiebaItem(type=0)
            item['forum'] = forum
            yield item
        self.pn += 1
        yield self.__get_my_forums(pn=self.pn)
    else:
        index = 0
        for forum in self.forums:
            if index > 0:
                sleep(2)
            index += 1
            yield self.__single_signin(forum['tbs'], forum['title'])
def parse(self, response): item = TiebaItem() selector = Selector(response) infos = selector.xpath('//div[@class="zu-top-feed-list"]/div') for info in infos: try: question = info.xpath( 'div/div/h2/a/text()').extract()[0].strip() favour = info.xpath( 'div/div/div[1]/div[1]/a/text()').extract()[0] user = info.xpath( 'div/div/div[1]/div[3]/span/span[1]/a/text()').extract()[0] user_info = info.xpath( 'div/div/div[1]/div[3]/span/span[2]/text()').extract( )[0].strip() content = info.xpath( 'div/div/div[1]/div[5]/div/text()').extract()[0].strip() item['question'] = question item['favour'] = favour item['user'] = user item['user_info'] = user_info item['content'] = content yield item except IndexError: pass urls = [ 'https://www.zhihu.com/topic/19552832/top-answers?page={}'.format( str(i)) for i in range(2, 50) ] for url in urls: yield Request(url, callback=self.parse)
def parse(self, response):  # the parse() callback
    item = TiebaItem()  # instantiate the item
    selector = Selector(response)
    infos = selector.xpath('//*[@id="TopicMain"]/div[2]/div/div')
    print(infos)
    for info in infos:
        try:
            question = info.xpath('div/div/h2/div/a/text()').extract()[0].strip()
            # favour = info.xpath('div/div/div[1]/div[1]/a/text()').extract()[0]
            # user = info.xpath('div/div/div[1]/div[3]/span/span[1]/a/text()').extract()[0]
            # user_info = info.xpath('div/div/div[1]/div[3]/span/span[2]/text()').extract()[0].strip()
            # content = info.xpath('div/div/div[1]/div[5]/div/text()').extract()[0].strip()
            item['question'] = question
            # item['favour'] = favour
            # item['user'] = user
            # item['content'] = content
            yield item  # emit the scraped data
        except IndexError:
            pass  # swallow IndexError from missing nodes
    urls = [
        'https://www.zhihu.com/topic/19552832/top-answers?page={}'.format(str(i))
        for i in range(2, 50)
    ]
    for url in urls:
        yield Request(url, callback=self.parse)  # recurse via callback
def getInfo(self, response):
    print("saving info")
    li_list = response.xpath(
        '//*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div')
    for li in li_list:
        item = TiebaItem()
        item['reply_num'] = li.xpath(
            'div[@class="col2_left j_threadlist_li_left"]/span[@title="回复"]/text()'
        ).extract_first()
        item['theme'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@title'
        ).extract_first()
        item['theme_site'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
        ).extract_first()
        item['theme_author'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author "]/@title'
        ).extract_first()
        item['create_time'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()'
        ).extract_first()
        item['content'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_text pull_left"]/div/text()'
        ).extract_first()
        item['replyer'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author_rely j_replyer"]/@title'
        ).extract_first()
        item['reply_date'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()'
        ).extract_first()
        yield item
def parse(self, response):
    for items in response.css('div.l_post.l_post_bright.j_l_post.clearfix'):
        var = TiebaItem()
        x = items.css('div.d_author>ul.p_author>li.d_name>a::text').extract()
        var['author'] = " ".join(x)
        var['image_urls'] = items.css(
            'div.d_post_content_main>div.p_content>cc>div>img::attr(src)').extract()
        var['images_name'] = []
        count = 0
        for i in var['image_urls']:
            count = count + 1
            var['images_name'].append(var['author'] + '_20FebI' + str(count))
        if not var['image_urls']:
            continue
        yield var
    next_url = response.css(
        'div.l_thread_info > ul > li.l_pager.pager_theme_5.pb_list_pager > a:nth-child(7)::attr(href)'
    ).extract_first()
    if next_url:
        next_url = response.urljoin(next_url)
        yield scrapy.Request(next_url, callback=self.parse)
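The image_urls/images_name pair above suggests the images are downloaded by a media pipeline that is not shown. A minimal sketch of wiring Scrapy's stock ImagesPipeline to the default image_urls field; the settings values are assumptions, not from the source:

# settings.py sketch (path and priority are assumed)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = './images'  # directory where downloaded files land
# ImagesPipeline reads item['image_urls'] and fills item['images'] by default;
# honoring the custom images_name list would need a file_path() override.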
def parse(self, response):
    item = TiebaItem()
    for box in response.xpath(
            '//div[@class="threadlist_title pull_left j_th_tit "]/a[@class="j_th_tit "]'):
        item['url'] = 'http://tieba.baidu.com' + box.xpath('.//@href').extract()[0]
        item['title'] = box.xpath('.//@title').extract()[0].strip()
        yield item
def parse(self, response):
    # Tieba ships its HTML inside comments, so the response can't be fed to CSS
    # selectors directly, and CrawlSpider is out as well.
    # Pull every commented-out chunk and join them back into one HTML string.
    html_str_list = re.findall(r'<!--(.*?)-->', response.text, re.S)
    html_str = ''.join(html_str_list)
    bs = BeautifulSoup(html_str, 'html.parser')
    all_article = bs.select('[class="t_con cleafix"]')
    for article in all_article:
        item = TiebaItem()
        bs_article = BeautifulSoup(str(article), 'html.parser')
        title = re.findall(r'<a.*?>(.*?)</a>',
                           str(bs_article.select('[class="j_th_tit"]')[0]))[0]
        detail_url = 'https://tieba.baidu.com' + re.findall(
            r'href="(.*?)"', str(bs_article.select('[class="j_th_tit"]')[0]))[0]
        # VIP users carry a different author class
        try:
            auther = re.findall(
                r'title="主题作者: (.*?)"',
                str(bs_article.select('[class="tb_icon_author"]')[0]))[0]
        except IndexError:
            auther = re.findall(
                r'title="主题作者: (.*?)"',
                str(bs_article.select('[class="tb_icon_author no_icon_author"]')[0]))[0]
        item['title'] = title
        item['auther'] = auther
        item['detail_url'] = detail_url
        # the numeric id doubles as a foreign key linking comments to threads
        article_id = int(detail_url.split('/')[-1])
        item['article_id'] = article_id
        # flag pinned threads
        if bs_article.select('[class="icon-top"]') != []:
            item['is_top'] = 1
        else:
            item['is_top'] = 0
        item_copy = copy.deepcopy(item)
        yield scrapy.Request(item_copy['detail_url'],
                             callback=self.parse_detail,
                             meta={'item': item_copy})
    next_page_url = 'https:' + re.findall(
        r'href="(.*?)"',
        str(bs.select('[class="next pagination-item"]')[0]))[0].replace('amp;', '')
    try:
        # if the "last page" link is missing, this is already the last page
        last_page_url = 'https:' + re.findall(
            r'href="(.*?)"',
            str(bs.select('[class="last pagination-item"]')[0]))[0].replace('amp;', '')
        yield scrapy.Request(next_page_url, callback=self.parse)
    except IndexError:
        return
def parse(self, response):
    item = TiebaItem()
    item['title'] = response.xpath(
        '//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a/text()').extract()
    item['author'] = response.xpath(
        '//*[@id="thread_list"]/li/div/div[2]/div[1]/div[2]/span[1]/span[1]/a/text()'
    ).extract()
    item['reply'] = response.xpath(
        '//*[@id="thread_list"]/li/div/div[1]/span/text()').extract()
    # TODO the first two replies are not captured
    yield item
def parse(self, response):
    for i in response.xpath('//li[@class=" j_thread_list clearfix"]'):
        # NB: instantiate the item inside the for loop, i.e. create a fresh
        # item after every yield
        item = TiebaItem()
        item['title'] = i.xpath(
            './/div[@class="threadlist_title pull_left j_th_tit "]/a/text()'
        ).extract_first()
        item['author'] = i.xpath(
            './/a[@class="frs-author-name j_user_card "]/text()').extract_first()
        item['describ'] = i.xpath(
            './/div[@class="threadlist_abs threadlist_abs_onlyline "]/text()'
        ).extract_first().strip()
        item['comment_num'] = i.xpath(
            './/span[@class="threadlist_rep_num center_text"]/text()').extract_first()
        link = i.xpath(
            './/div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
        ).extract_first()
        item['_id'] = response.urljoin(link)
        if link:
            yield Request(response.urljoin(link),
                          callback=self.comment_parse,
                          meta={'item': item})
            link = item['_id'].split('/')[-1]
            yield Request(
                'https://tieba.baidu.com/p/totalComment?t=1506043640283&tid=%s&fid=13785031&pn=1&see_lz=0' % link,
                callback=self.sub_comment_parse,
                meta={'item': item})
            # yield SplashRequest(response.urljoin(link), callback=self.comment_parse, meta={'item': item}, args={'wait': '0.2'})
        # item['_id'] = item['title']
        print(item['title'])
        for key in item:
            item[key] = item[key] if item[key] else "nothing"
        # yield item
    self.index += 1
    a = response.xpath(
        '//div[@class="thread_list_bottom clearfix"]//a[@class="next pagination-item "]/text()'
    ).extract_first()
    if a:
        print('>>>\n', a, "page %s" % self.index, '\n>>>')
        # yield {'glap': '>>>' + a + "this is %s page" % self.index}
    next_page_url = response.xpath(
        '//div[@class="thread_list_bottom clearfix"]//a[@class="next pagination-item "]/@href'
    ).extract_first()
    if next_page_url is not None:
        yield Request(response.urljoin(next_page_url))
def parse(self, response):
    item = TiebaItem()
    div_list = response.xpath('//div[contains(@class,"i")]')
    for div in div_list:
        item['title'] = div.xpath('./a/text()').extract_first()
        item['href'] = self.base_url + div.xpath('./a/@href').extract_first()
        item['img_list'] = []
        yield scrapy.Request(url=item['href'],
                             callback=self.parse_detail,
                             meta=item)
    next_page = response.xpath('//a[text()="下一页"]/@href').extract_first()
    if next_page is not None:
        next_page = self.base_url + next_page
        yield scrapy.Request(url=next_page, callback=self.parse)
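parse_detail is referenced above but not shown. A hypothetical sketch of what such a detail callback could look like, appending image URLs to the img_list the item carries through meta; the XPath class and the single-page handling are assumptions, not the author's code:

def parse_detail(self, response):
    item = response.meta  # the item fields travel in meta (passed as meta=item above)
    # hypothetical selector for in-post images on the detail page
    item['img_list'].extend(
        response.xpath('//img[@class="BDE_Image"]/@src').extract())
    yield item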
def parse(self, response): li_list = response.xpath( "//div[@id='pagelet_frs-list/pagelet/thread_list']/ul/li") for li in li_list: item = TiebaItem() item["标题"] = li.xpath( "./div/div[2]/div/div/a/text()").extract_first() item["回复数"] = li.xpath("./div/div[1]/span/text()").extract_first() item["详情页链接"] = li.xpath( "./div/div[2]/div/div/a/@href").extract_first() item["详情页链接"] = [ "https://tieba.baidu.com" + i for i in item["详情页链接"] ] yield scrapy.Request(item["详情页链接"], callback=self.get_next_page, mate={"item": item}) next_url = response.xpath( "//div[@class='thread_list_bottom clearfix']/div/a[10]/@href" ).extract_first() yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    # the thread list ships commented out inside a hidden <code> block; pull it out
    html = re.findall(
        r'pagelet_html_frs-list/pagelet/thread_list" style="display:none;"><!--(.*?)--></code>',
        response.body.decode(), re.S)[0]
    html = etree.HTML(html)
    li_list = html.xpath(
        '//ul[@id="thread_list"]//li[@class=" j_thread_list clearfix"]')
    for li in li_list:
        item = TiebaItem()
        item['title'] = li.xpath('.//a/text()')[0]
        item['li_url'] = li.xpath('.//a/@href')[0]
        # print(item)
        item['img_list'] = []
        if item['li_url']:
            yield response.follow(item['li_url'],
                                  callback=self.parse_detail,
                                  meta={'item': item})
    next_page = html.xpath(
        '//div[@id="frs_list_pager"]/a[contains(@class,"next")]/@href')[0]
    if next_page and self.page < self.max_page:
        self.page += 1
        yield response.follow(next_page, callback=self.parse)
def parse(self, response):
    # group the page into individual thread nodes
    tz_list = response.xpath("//div[contains(@class, 'i')]")
    print("-" * 100, tz_list)
    # walk the matched nodes and pull out the fields
    for div in tz_list:
        item = TiebaItem()
        item['tz_href'] = 'https://tieba.baidu.com' + div.xpath(".//a/@href").extract_first()
        item['tz_title'] = div.xpath(".//a/text()").extract()
        # print("*" * 100, item)
        yield scrapy.Request(item['tz_href'],
                             callback=self.parse_detail,
                             meta={"item": item})
    # next page
    next_url = response.xpath("//a[text()='下一页']/@href").extract_first()
    if next_url is not None:
        yield scrapy.Request(
            'http://tieba.baidu.com/mo/q---596A8CA33D57134A7383E14E264D8288%3AFG%3D1--1-3-0--2--wapp_1516511596154_469' + next_url,
            callback=self.parse)
def third_parse(self, response):
    '''Get every post's content.'''
    global tmpItems
    item_1 = response.meta['item_1']
    response = response.body
    response = self.clean_data(response)
    html = Selector(text=response)
    page = html.xpath(
        '//div[@class="d_post_content j_d_post_content clearfix"]/text()'
    ).extract()
    items = []
    item = TiebaItem()
    item['title'] = item_1['title'].encode('utf8')
    item['url'] = item_1['url'].encode('utf8')
    item['pageUrl'] = item_1['pageUrl'].encode('utf8')
    page = [p.strip() for p in page]
    item['text'] = "##".join(page)
    # print item
    items.append(item)
    tmpItems = []
    return items
def parse(self, response):
    item = TiebaItem()
    selector = Selector(response)
    infos = selector.xpath('')  # selector intentionally left blank in this skeleton
    for info in infos:
        try:
            question = info.xpath()
            favour = info.xpath()
            user = info.xpath()
            user_info = info.xpath()
            content = info.xpath()
            item['question'] = question
            item['favour'] = favour
            item['user'] = user
            item['user_info'] = user_info
            item['content'] = content
            yield item
        except IndexError:
            pass
    urls = [''.format(str(i)) for i in range(2, 50)]  # URL template left blank
    for url in urls:
        yield Request(url, callback=self.parse)  # recurse via callback
def parse_post_reply(self, response):
    json_obj = json.loads(response.body)
    no = int(json_obj['no'])
    data_obj = json_obj['data']
    tid = int(data_obj['tid']) if 'tid' in data_obj else 0
    if no == 0 and tid != 0:
        utils.debug('reply succeeded:', json.dumps(json_obj['data']))
        meta = response.meta['datas']
        data = dict()
        data['fid'] = meta['fid']
        data['kw'] = meta['kw']
        data['tbs'] = meta['tbs']
        data['tid'] = meta['tid']
        data['content'] = meta['content']
        data['timestamp'] = utils.timestamp
        item = TiebaItem(type=2)
        item['reply'] = data
        yield item
        time.sleep(60)
        yield self.post_reply(tid)
    else:
        err_code = int(json_obj['err_code'])
        utils.debug('reply failed:', get_post_err_msg(no, err_code, response.body))
        if no == 220034 and tid != 0:  # rate-limited: back off and retry
            time.sleep(300)
            yield self.post_reply(tid)
        if no == 40 and tid != 0:  # a captcha is required
            vcode_obj = json_obj['data']['vcode']
            input_captcha = utils.show_captcha(vcode_obj['captcha_vcode_str'])
            captcha_type = vcode_obj['captcha_code_type']
            yield self.__check_captcha(captcha=input_captcha, captcha_type=captcha_type)
            yield self.post_reply(tid, input_captcha)
def parsePage(self, response):
    selector = Selector(response)
    # content_list3 = selector.xpath("/html/body//div[@class='s_post_list']/div[@class='s_post']")
    content_list = selector.xpath(
        "/html/body//div[@class='s_post_list']/div[@class='s_post']/span[@class='p_title']/a")
    content_list_2 = selector.xpath(
        "/html/body//div[@class='s_post_list']/div[@class='s_post']/div[@class='p_content']")
    i = 0
    for content in content_list:
        item = TiebaItem()
        title = content.xpath("string(.)").extract_first()
        url = content.xpath('@href').extract_first()
        content2 = content_list_2[i].xpath("string(.)").extract_first()
        i = i + 1
        url = str(self.host + url)
        item['url'] = url
        item['title'] = title
        item['content'] = content2
        print(url)
        print(title)
        yield item
def parse(self, response):
    li_list = response.xpath(
        '//*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div')
    for li in li_list:
        item = TiebaItem()
        item['reply_num'] = li.xpath(
            'div[@class="col2_left j_threadlist_li_left"]/span[@title="回复"]/text()'
        ).extract_first()
        item['theme'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@title'
        ).extract_first()
        item['theme_site'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
        ).extract_first()
        item['theme_author'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author "]/@title'
        ).extract_first()
        item['create_time'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()'
        ).extract_first()
        item['content'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_text pull_left"]/div/text()'
        ).extract_first()
        item['replyer'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author_rely j_replyer"]/@title'
        ).extract_first()
        item['reply_date'] = li.xpath(
            'div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()'
        ).extract_first()
        yield item
    # next page
    num = int(response.xpath(
        '//*[@id="frs_list_pager"]/span[@class="pagination-current pagination-item "]/text()'
    ).extract_first())
    if num <= 10:
        next_page = response.xpath(
            '//*[@id="frs_list_pager"]/a[@class="next pagination-item "]/@href'
        ).extract_first()
        print("next_page, %s" % next_page)
        print(num)
        if next_page is not None:
            yield response.follow(next_page, self.parse)

# Reference XPaths for the thread-list fields:
# replies
# //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[@class="col2_left j_threadlist_li_left"]/span[@title="回复"]/text()
# theme
# //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit member_thread_title_frs "]/a/@href
# //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit member_thread_title_frs "]/a/@title
# author
# //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author no_icon_author"]/@title
# creation time
# //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()
# content
# //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_text pull_left"]/div/text()
# last replier
# //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author_rely j_replyer"]/@title
# last reply time
# //*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="threadlist_reply_date pull_right j_reply_data"]/text()
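All of these callbacks hand items (or plain dicts, as in the first snippet) to item pipelines that are not shown anywhere in this section. A minimal hypothetical pipeline sketch that would persist them as JSON lines; the class name and output filename are assumptions:

import json

class TiebaPipeline(object):
    # hypothetical pipeline: one JSON-lines file per crawl
    def open_spider(self, spider):
        self.fp = open('tieba.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # dict() works for both scrapy.Item instances and plain dicts
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()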