def parse_detail(self, response):
    """Extract title, author, publish time, body text and star count from one article page."""
    title = response.xpath('//h1[@class="ph"]/text()').get()
    # The authors <p> carries both the author <a> and the publish-time <span>.
    author_a = response.xpath('//p[@class="authors"]')
    author = author_a.xpath('./a/text()').get()
    pub_time = author_a.xpath('./span/text()').get()
    # Collect every text node of the article body and flatten to one string.
    fragments = response.xpath('//td[@id="article_content"]//text()').getall()
    article_content = ''.join(fragments).strip()
    # Labelled GitHub_Star by the original author; taken from the first cl/div/a text.
    GitHub_Star = response.xpath('//div[@class="cl"]/div/a/text()').get()
    item = WxappItem()
    item['title'] = title
    item['author'] = author
    item['pub_time'] = pub_time
    item['article_content'] = article_content
    item['GitHub_Star'] = GitHub_Star
    yield item
def parse_detail(self, response):
    """Parse an article detail page into a single WxappItem."""
    author_p = response.xpath('//p[@class="authors"]')
    yield WxappItem(
        title=response.xpath('//h1[@class="ph"]/text()').get(),
        author=author_p.xpath('./a/text()').get(),
        pub_time=author_p.xpath('./span/text()').get(),
        # Join all body text nodes, trimming surrounding whitespace.
        content="".join(
            response.xpath('//td[@id="article_content"]//text()').getall()
        ).strip(),
    )
def parse_detial(self, response):
    """Parse an article detail page into a WxappItem.

    NOTE(review): method name keeps the historical 'detial' spelling because it
    is registered as a callback elsewhere — renaming would break the caller.
    """
    title = response.xpath("//h1[@class='ph']/text()").get()
    author = response.xpath("//p[@class='authors']//a/text()").get()
    content = response.xpath(
        '//td[@id="article_content"]//text()').getall()
    # BUG FIX: was "".join(content).split(), which tokenized the article into
    # a list of words; strip() keeps the full text minus outer whitespace.
    content = "".join(content).strip()
    item = WxappItem(title=title, author=author, content=content)
    yield item
def parse_detail(self, response):
    """Yield a WxappItem holding only the article title and publish time.

    Author and body extraction were deliberately left disabled by the
    original author; only the two active fields are scraped here.
    """
    title = response.xpath("//h1[@class='ph']/text()").get()
    byline = response.xpath("//p[@class='authors']")
    pub_time = byline.xpath(".//span/text()").get()
    yield WxappItem(title=title, pub_time=pub_time)
def parse_detail(self, response):
    """Collect title, author, publish time and body text from one article page."""
    title = response.xpath("//h1[@class='ph']/text()").get()
    # The byline <p> holds both the author link and the timestamp span.
    byline = response.xpath("//p[@class='authors']")
    author = byline.xpath(".//a/text()").get()
    pub_time = byline.xpath(".//span/text()").get()
    # Flatten every text node of the article body into one trimmed string.
    parts = response.xpath("//td[@id='article_content']//text()").getall()
    content = "".join(parts).strip()
    yield WxappItem(title=title, author=author, pub_time=pub_time, content=content)
def parse_detail(self, response):
    """Parse an article page addressed by browser-generated absolute XPaths."""
    title = response.xpath('//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[1]/h1//text()').get()
    # Narrow to the <p> that carries both the author link and the timestamp.
    byline = response.xpath('//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[3]/div[1]/p')
    author = byline.xpath('.//a/text()').get()
    pub_time = byline.xpath('.//span//text()').get()
    # Join the body's text nodes into a single trimmed string.
    body_parts = response.xpath('//*[@id="article_content"]//text()').getall()
    content = "".join(body_parts).strip()
    yield WxappItem(title=title, author=author, pub_time=pub_time, content=content)
def parse_item(self, response):
    """Extract summary fields (title, author, time, blockquote intro) and yield them."""
    yield WxappItem(
        title=response.xpath(r"//h1[@class='ph']/text()").get(),
        author=response.xpath(r"//p[@class='authors']/a/text()").get(),
        pub_time=response.xpath(r"//span[@class='time']/text()").get(),
        # Only the first text node of the blockquote paragraph (.get(), not .getall()).
        content=response.xpath(r"//div[@class='blockquote']/p/text()").get(),
    )
def parse_detail(self, response):
    """Parse an article detail page, logging the URL being processed.

    Robustness fix: .get() returns None when a node is absent, which made
    the chained .strip() raise AttributeError; get(default='') avoids the
    crash and yields an empty string instead.
    """
    print("\033[0;37;40m\t=============URL===============\033[0m")
    print("\033[0;37;40m\t" + response.request.url + "\033[0m")
    title = response.xpath("//div[@class='h hm cl']/div[@class='cl']/h1/text()").get(default='').strip()
    author = response.xpath("//div[@class='avatar_right cl']//p[@class='authors']/a/text()").get(default='').strip()
    # Local renamed from 'time' to avoid shadowing the stdlib module name;
    # the item keyword stays 'time' (external interface).
    pub_time = response.xpath("//div[@class='avatar_right cl']//p[@class='authors']/span/text()").get(default='').strip()
    content = response.xpath("//div[@class='content_middle cl']/div[@class='d']/table//td[@id='article_content']//text()").getall()
    content = "".join(content).strip()
    item = WxappItem(title=title, author=author, time=pub_time, content=content)
    yield item
def parse_detail(self, response):
    """Parse an article detail page into a WxappItem.

    Bug fix: the authors selector used '///p[@class="authors"]' — a triple
    slash is invalid XPath and raises an evaluation error at runtime;
    corrected to '//p[@class="authors"]'.
    """
    title = response.xpath('//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[1]/h1/text()').get()
    author_info = response.xpath('//p[@class="authors"]')
    author = author_info.xpath('.//a/text()').get()
    pub_time = author_info.xpath('.//span/text()').get()
    # Grab every text node of the middle-content block, join to one string,
    # and trim the surrounding whitespace.
    article_content = response.xpath("//div[@class='content_middle cl']//text()").getall()
    content = "".join(article_content).strip()
    item = WxappItem(title=title, author=author, pub_time=pub_time, content=content)
    yield item
def parse_item(self, response):
    """Yield an item holding the article title and its flattened body text."""
    title = response.xpath("//h1[@class='ph']/text()").get()
    # Join every text node under the article cell and trim outer whitespace.
    body = response.xpath('//td[@id="article_content"]//text()').getall()
    wen = ''.join(body).strip()
    yield WxappItem(title=title, wen=wen)
def parse_item(self, response):
    """Parse one article page into a WxappItem."""
    title = response.xpath('//h1[@class="ph"]/text()').get()
    authors = response.xpath('//p[@class="authors"]/a/text()').get()
    pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
    # Join all body text nodes and strip surrounding whitespace.
    fragments = response.xpath('//td[@id="article_content"]//text()').getall()
    content = "".join(fragments).strip()
    yield WxappItem(title=title, authors=authors, pub_time=pub_time, content=content)
def parse_detail(self, reponse):
    """Parse an article page, print the extracted fields, then yield them.

    NOTE(review): the parameter is spelled 'reponse' in the original; kept
    as-is because the signature is the externally visible interface.
    """
    title = reponse.xpath("//h1/text()").get()
    byline = reponse.xpath("//p[@class='authors']")
    author_name = byline.xpath("./a/text()").get()
    author_pubdate = byline.xpath("./span/text()").get()
    pieces = reponse.xpath("//section/p/text()").getall()
    content = "".join(pieces).strip()
    # Debug output left in place by the original author.
    print("title is %s \n" % title)
    print("the author is %s ,pubdate is %s" % (author_name, author_pubdate))
    print("content : %s" % content)
    yield WxappItem(title=title, author_name=author_name,
                    author_pubdate=author_pubdate, content=content)
def parse_item(self, response):
    """Build a WxappItem from an article detail page."""
    heading = response.xpath("//h1[@class='ph']/text()").get()
    writer = response.xpath("//p[@class='authors']/a/text()").get()
    published = response.xpath("//p[@class='authors']/span/text()").get()
    # Flatten the article body's text nodes into one trimmed string.
    body = "".join(
        response.xpath("//td[@id='article_content']//text()").getall()).strip()
    yield WxappItem(title=heading, author=writer, pub_time=published, content=body)
def parse_detall(self, response):
    """Parse an article detail page into a WxappItem.

    NOTE(review): method name keeps its 'detall' spelling — it is referenced
    as a callback elsewhere.

    Bug fix: the publish-time lookup used the absolute path
    "//span[@class='time']" on the byline sub-selector, which searches the
    whole document rather than the authors <p>; anchored it with ".//".
    """
    title = response.xpath("//h1[@class='ph']/text()").get()
    author_1 = response.xpath("//p[@class='authors']")
    author = author_1.xpath(".//a/text()").get()
    put_time = author_1.xpath(".//span[@class='time']/text()").get()
    article_content = response.xpath(
        "//td[@id='article_content']//text()").getall()
    content = "".join(article_content).strip()
    item = WxappItem(title=title, author=author, put_time=put_time, content=content)
    yield item
def parse_detail(self, response):
    """Parse title/author/time (stripped) plus the article's body text nodes.

    Robustness fix: .get() returns None when the node is missing, so the
    chained .strip() raised AttributeError; .get(default='') avoids the crash.
    NOTE: content is intentionally left as the list returned by getall(),
    matching the original behavior.
    """
    title = response.xpath('//h1[@class="ph"]/text()').get(default='').strip()
    author = response.xpath('//p[@class="authors"]/a/text()').get(default='').strip()
    pub_time = response.xpath(
        ' //p[@class="authors"]/span/text()').get(default='').strip()
    content = response.xpath(
        '//td[@id="article_content"]//text()').getall()
    item = WxappItem(title=title, author=author, pub_time=pub_time, content=content)
    yield item
    # Runs when the engine resumes the generator after consuming the item.
    print('--' * 30)
def parse_detail(self, response):
    """Parse one article page into a WxappItem.

    Named parse_detail (not parse) on purpose: overriding parse() would
    clobber the base spider's built-in parse machinery.
    """
    title = response.xpath("//h1[@class='ph']/text()").get()
    byline = response.xpath("//p[@class='authors']")
    author = byline.xpath(".//a/text()").get()
    pub_time = byline.xpath(".//span/text()").get()
    text_nodes = response.xpath("//td[@id='article_content']//text()").getall()
    content = "".join(text_nodes).strip()
    yield WxappItem(author=author, title=title, pub_time=pub_time, content=content)
def parse_item(self, response):
    """Parse an article page and return the populated WxappItem.

    Bug fix: article_content was built with ''.join(...).split(), which
    tokenizes the article into a list of words; .strip() keeps the full
    text and only removes surrounding whitespace.
    """
    title = response.xpath('//h1[@class="ph"]/text()').get()
    author_p = response.xpath('//p[@class="authors"]')
    author = author_p.xpath('.//a/text()').get()
    pub_time = author_p.xpath('.//span/text()').get()
    article_content = ''.join(
        response.xpath(
            '//td[@id="article_content"]//text()').getall()).strip()
    item = WxappItem()
    item['title'] = title
    item['author'] = author
    item['pub_time'] = pub_time
    item['article_content'] = article_content
    return item
def parse_detail(self, response):
    """Yield a WxappItem built from one article detail page."""
    byline = response.xpath("//p[@class='authors']")
    yield WxappItem(
        title=response.xpath("//h1[@class='ph']/text()").get(),
        author=byline.xpath(".//a/text()").get(),
        time=byline.xpath(".//span/text()").get(),
        # Flatten the article body's text nodes into one trimmed string.
        content="".join(
            response.xpath("//td[@id='article_content']//text()").getall()
        ).strip(),
    )
def parse_item(self, response):
    """Return (not yield) a WxappItem scraped from one article page."""
    title = response.xpath('//h1[@class="ph"]/text()').get()
    # The byline <p> inside the avatar block carries the author link and time span.
    byline = response.xpath('//div[@class="avatar_right cl"]//p')
    author = byline.xpath('./a/text()').get()
    pub_time = byline.xpath('./span/text()').get()
    content = ''.join(
        response.xpath('//td[@id="article_content"]//text()').getall()).strip()
    return WxappItem(title=title, author=author, pub_time=pub_time, content=content)
def parse_detail(self, response):
    """Parse an article detail page into a WxappItem.

    Bug fix: the body was queried via author_p.xpath("//td[...]") — an
    absolute path on a sub-SelectorList. That silently returns nothing when
    the authors <p> is missing and duplicates the text when it matches more
    than once; query the response directly instead.
    """
    title = response.xpath("//h1[@class='ph']/text()").get()
    author_p = response.xpath("//p[@class='authors']")
    author = author_p.xpath("./a/text()").get()
    pub_time = author_p.xpath(".//span[@class='time']/text()").get()
    article = response.xpath(
        "//td[@id='article_content']//text()").getall()
    article = ''.join(article).strip()
    item = WxappItem(title=title, author=author, pub_time=pub_time, article=article)
    yield item
def parse_detail(self, response):
    """Parse an article detail page, yield the item, then log it.

    Fix: the body selector was "//td//text()", which scoops up text from
    every table cell on the page; narrowed to td[@id='article_content'],
    the article cell used by every other extraction routine in this file.
    """
    title = response.xpath("//*[@id='ct']/div[1]/div/div[1]/div/div[2]/div[1]/h1/text()").get()  # article title
    author = response.xpath("//p[@class='authors']//a/text()").get()  # author name
    time = response.xpath("//*[@id='ct']/div[1]/div/div[1]/div/div[2]/div[3]/div[1]/p/span/text()").get()  # publish time
    content = response.xpath("//td[@id='article_content']//text()").getall()  # body text nodes
    content = "".join(content).strip()  # list -> single trimmed string
    item = WxappItem(title=title, author=author, time=time, content=content)
    yield item  # hand the item to the pipelines
    # Debug logging; runs when the engine resumes the generator.
    print("=" * 40)
    print(item)
    print("=" * 40)
def parse_detail(self, response):
    """Yield one WxappItem per article detail page."""
    heading = response.xpath('//h1[@class="ph"]/text()').get()
    byline = response.xpath('//p[@class="authors"]')
    writer = byline.xpath('.//a/text()').get()
    published = byline.xpath('.//span/text()').get()
    # Join all body text nodes into a single trimmed string.
    body = ''.join(
        response.xpath(
            '//td[@id="article_content"]//text()').getall()).strip()
    yield WxappItem(title=heading, author=writer, pub_time=published, content=body)
def parse_detail(self, response):
    """Print the page URL, then yield an item whose content field holds the
    raw HTML of the article <td> (not its flattened text)."""
    print('=' * 30)
    print(response.url)
    byline = response.xpath('//p[@class="authors"]')
    yield WxappItem(
        title=response.xpath('//h1/text()').get(),
        author=byline.xpath('./a/text()').get(),
        date=byline.xpath('./span/text()').get(),
        content=response.xpath('//td[@id="article_content"]').get(),
    )
def parse_item(self, response):
    """Return a WxappItem scraped from one article page."""
    heading = response.xpath("//h1[@class='ph']/text()").get()
    writer = response.xpath("//p[@class='authors']/a/text()").get()
    published = response.xpath("//p[@class='authors']/span/text()").get()
    # Flatten the article body's text nodes and trim the outer whitespace.
    body_nodes = response.xpath(
        "//td[@id='article_content']//text()").getall()
    body = "".join(body_nodes).strip()
    return WxappItem(title=heading, author=writer, push_time=published, content=body)
def parse_detail(self, response):
    """Fill a WxappItem field-by-field from an article detail page."""
    item = WxappItem()
    item['title'] = response.xpath('//div/h1[@class="ph"]/text()').get()
    # The byline <p> holds both the author link and the timestamp span.
    byline = response.xpath('//div/p[@class="authors"]')
    item['author'] = byline.xpath('.//a/text()').get()
    item['pub_time'] = byline.xpath('.//span[@class="time"]/text()').get()
    # Join every body text node into one string and strip surrounding whitespace.
    item['content'] = "".join(
        response.xpath('//td[@id="article_content"]//text()').getall()).strip()
    yield item
def parse_detail(self, response):
    """Yield the scraped item, then print the fields for debugging."""
    title = response.xpath("//h1[@class='ph']/text()").get()
    byline = response.xpath("//p[@class='authors']")
    author = byline.xpath(".//a/text()").get()
    time = byline.xpath(".//span/text()").get()
    content = "".join(
        response.xpath("//td[@id='article_content']//text()").getall()).strip()
    yield WxappItem(title=title, author=author, time=time, content=content)
    # Executed once the engine resumes the generator after the yield.
    print('author:%s/pub_time:%s' % (author, time))
    print(title)
    print(content)
def parse_item(self, response):
    """Scrape the title and intro paragraph with BeautifulSoup and yield them."""
    soup = BeautifulSoup(response.text, 'lxml')
    # Browser-generated CSS paths; select_one returns the first match.
    title = soup.select_one(
        '#ct > div.mn > div > div.middle_info.cl > div > div.h.hm.cl > div:nth-child(1) > h1'
    ).text
    describe = soup.select_one(
        '#ct > div.mn > div > div.middle_info.cl > div > div.blockquote > p'
    ).text
    # Progress logging.
    print(title)
    print('=' * 50)
    yield WxappItem(title=title, describe=describe)
def parse_detail(self, response):
    """Yield a WxappItem scraped from one article detail page."""
    heading = response.xpath("//h1[@class='ph']/text()").get()
    byline = response.xpath("//p[@class='authors']")
    writer = byline.xpath(".//a/text()").get()
    published = byline.xpath(".//span/text()").get()
    # Flatten the article's text nodes and drop the outer whitespace.
    body = "".join(
        response.xpath(
            "//td[@id='article_content']//text()").getall()).strip()
    yield WxappItem(title=heading, author=writer, pub_time=published, content=body)
def parse_detail(self, response):
    """Yield a WxappItem scraped from an article detail page."""
    title = response.xpath('//h1[@class="ph"]/text()').get()
    author = response.xpath('//p[@class="authors"]/a/text()').get()
    pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
    # Join the body's text nodes into one string with outer whitespace removed.
    article_content = ''.join(
        response.xpath(
            '//td[@id="article_content"]//text()').getall()).strip()
    yield WxappItem(title=title, author=author, pub_time=pub_time,
                    article_content=article_content)
def parse_detail(self, response):
    """Scrape one article page, log the extracted fields, then yield the item."""
    print('-' * 80)
    title = response.xpath("//h1[@class='ph']/text()").get()
    author = response.xpath("//p[@class='authors']/a/text()").get()
    pub_time = response.xpath("//span[@class='time']/text()").get()
    article_content = ''.join(
        response.xpath(
            "//td[@id='article_content']//text()").getall()).strip()
    # Debug output retained from the original implementation.
    print('title:', title, 'author:', author, 'pub_time:', pub_time)
    print('article_content:', article_content)
    item = WxappItem(title=title, author=author, pub_time=pub_time,
                     article_content=article_content)
    print('-' * 80)
    yield item