def get_name(self, response):
    """Parse a book-list page: pull per-novel metadata out of each
    "book-mid-info" div with regexes, then follow every novel's detail page
    carrying the partially-filled item in the request meta."""
    page_text = response.body.decode('utf-8')
    for block in re.findall(r'<div class="book-mid-info">(.*?)</div>', page_text, re.S):
        # Anchor texts appear in order: name, author, category.
        anchors = re.findall(r'target="_blank".*?>(.*?)</a>', block, re.S)
        novel_name = anchors[0]
        author = anchors[1]
        category = anchors[2]
        novelurl = "http:" + re.findall(r'<a href="(.*?)" target.*?', block, re.S)[0]
        # Bare <span> values: serial status first, then word count.
        spans = re.findall(r'<span >(.*?)</span>', block, re.S)
        item = XiaoshuoItem()
        item['novel_name'] = novel_name
        item['author'] = author
        item['novelurl'] = novelurl
        item['category'] = category
        item['serialnumber'] = spans[1]
        item['serialstatus'] = spans[0]
        if novelurl is not None:
            yield Request(str(novelurl), dont_filter=True,
                          callback=self.get_novelcontent,
                          meta={'targentcontent': item})
def parse(self, response):
    """Parse one listing page of a category: yield an item per novel row,
    request each novel's detail page, then follow the "next page" link."""
    print("==============render over===============================")
    # Category title reads "<name> - 文章列表"; keep only <name>.
    catagory = response.xpath('//dl[@id="content"]/dt/h2/text()').re(u'(.+) - 文章列表')[0]
    for sel in response.xpath('//dl[@id="content"]/dd/table/tr[@bgcolor="#FFFFFF"]'):
        item = XiaoshuoItem()
        name = sel.xpath('./td[1]/a[2]/text()').extract()[0]
        author = sel.xpath('./td[4]/text()').extract()[0]
        novelurl = sel.xpath('./td[2]/a/@href').extract()[0]
        serialstatus = sel.xpath('./td[6]/text()').extract()[0]
        # NOTE(review): reads the same cell as `author` (td[4]) — looks like a
        # copy-paste slip; confirm which column actually holds the word count.
        wordsnum = sel.xpath('./td[4]/text()').extract()[0]
        # Serial id is the second-to-last path segment of the novel URL.
        url_parts = novelurl.split('/')  # renamed from `set`, which shadowed the builtin
        serialnum = url_parts[-2]
        item["name"] = name
        item["author"] = author
        item["novelurl"] = novelurl
        item["serialstatus"] = serialstatus
        item["wordsnum"] = wordsnum
        item["serialnum"] = serialnum
        item["category"] = catagory
        yield item
        yield scrapy.Request(url=novelurl, callback=self.novel_get_parse,
                             meta={'serial_id': serialnum})
    # BUG FIX: extract()[0] raised IndexError on the last page, so the
    # `if next_page:` guard was unreachable; extract_first() returns None.
    next_page = response.xpath('//dd[@class="pages"]/div/a[@class="next"]/@href').extract_first()
    if next_page:
        yield scrapy.Request(next_page)
def bookParse(self, response):
    """Extract book metadata from the page's og:novel meta tags and follow
    the reader link to the chapter list."""
    item = XiaoshuoItem()
    # 小说类型 / novel category
    item['noveltype'] = response.xpath('//meta[contains(@property, "og:novel:category")]/@content').extract()[0]
    # author
    item['novelauthor'] = response.xpath('//meta[contains(@property, "og:novel:author")]/@content').extract()[0]
    # book name
    item['novelname'] = response.xpath('//meta[contains(@property, "og:novel:book_name")]/@content').extract()[0]
    # serialisation status
    item['novelstatus'] = response.xpath('//meta[contains(@property, "og:novel:status")]/@content').extract()[0]
    # last update time
    item['updatetime'] = response.xpath('//meta[contains(@property, "og:novel:update_time")]/@content').extract()[0]
    # Summary may span several text nodes; join them into one string.
    item['novelsummary'] = "".join(response.xpath('//div[contains(@id, "waa")]/text()').extract())
    # The reader link doubles as the stored URL and the next request target
    # (the original ran the identical XPath twice).
    book_url = response.xpath('//a[contains(@class, "reader")]/@href').extract()[0]
    item['novelurl'] = book_url
    # NOTE(review): `item` is never yielded and is not passed via meta —
    # confirm whether chapterListParser is expected to receive it.
    yield scrapy.Request(url=book_url, callback=self.chapterListParser)
def parse(self, response):
    """Collect novel metadata from the all-books page into one item of
    parallel lists, and download each cover image into a per-novel folder."""
    item = XiaoshuoItem()
    item['novel_name'] = response.xpath('//ul[@class="all-img-list cf"]//li[re:match(@data-rid,"\d")]/div[@class="book-mid-info"]/h4/a/text()').extract()
    item['novel_writter'] = response.xpath('//ul[@class="all-img-list cf"]//li/div[2]/p[@class="author"]/a[@class="name"]/text()').extract()
    item['novel_main_type'] = response.xpath('//ul[@class="all-img-list cf"]//li/div[2]/p[@class="author"]/a[2]/text()').extract()
    item['novel_sub_type'] = response.xpath('//ul[@class="all-img-list cf"]//li/div[2]/p[@class="author"]/a[3]/text()').extract()
    item['novel_status'] = response.xpath('//ul[@class="all-img-list cf"]//li/div[2]/p[@class="author"]/span/text()').extract()
    item['novel_img_url'] = response.xpath('//ul[@class="all-img-list cf"]//li/div[1]//img/@src').extract()
    item['novel_real_url'] = response.xpath('//ul[@class="all-img-list cf"]//li/div[2]/h4/a/@href').extract()
    # Raw string: the original 'D:\spider_data\\' relied on '\s' not being an
    # escape sequence, which is deprecated behaviour.
    base_dir = r'D:\spider_data'
    for novel_dir, img_url in zip(item['novel_name'], item['novel_img_url']):
        target = os.path.join(base_dir, novel_dir)
        # exist_ok replaces the racy exists()/mkdir() pair.
        os.makedirs(target, exist_ok=True)
        img_resp = requests.get('https:' + img_url, headers=self.headers)
        # Write the cover via its full path instead of chdir()'ing the whole
        # process (the original never restored the working directory).
        with open(os.path.join(target, novel_dir + '.jpg'), 'wb') as fp:
            fp.write(img_resp.content)
        sleep(1)  # throttle image downloads
    yield item
def full_text_parse(self, response):
    """Parse one chapter page into an item and follow the "next chapter"
    link (the 5th anchor of the bottom navigation bar)."""
    html = BeautifulSoup(response.text, 'lxml')
    content = html.find('div', {'id': 'content'}).text
    # Renamed from `next`, which shadowed the builtin.
    next_href = html.find('div', {'class': 'bottem2'}).find_all('a')[4]['href']
    chapter = html.find('div', {'class': 'bookname'}).find('h1').text
    name = html.find('div', {'class': 'con_top'}).find_all('a')[1].text
    author = response.meta['author']
    item = XiaoshuoItem()
    item['chapter'] = chapter
    item['name'] = name
    item['content'] = content
    item['author'] = author
    print(item['author'])
    print(item['name'])
    print(item['chapter'])
    print(item['content'])
    yield item
    # BUG FIX: the original wrapped this in `while 1: try/except: pass`,
    # which loops forever if Request() raises (e.g. a malformed href) because
    # the same construction is retried unchanged.  Fail soft exactly once.
    try:
        request = scrapy.Request(url='http://www.biquge.tv{}'.format(next_href),
                                 callback=self.full_text_parse,
                                 dont_filter=True)
        request.meta['author'] = author
        yield request
    except ValueError:
        # Bad "next" URL — stop following this chain.
        pass
def parse_itme1(self, response):
    """Build one download item: the second rel="nofollow" link on the page
    plus the name/category passed along via request meta."""
    item = XiaoshuoItem()
    # The second matching href is the actual file link.
    item['file_urls'] = [response.xpath('//ul/li/a[@rel="nofollow"]/@href').extract()[1]]
    item['names'] = response.meta['name']
    item['leibie'] = response.meta['leibie']
    yield item
def parse(self, response):
    """Yield a name/author item for every book entry on the listing page."""
    for entry in response.xpath('//ul[@class="all-img-list cf"]/li'):
        item = XiaoshuoItem()
        item['name'] = entry.xpath('./div[@class="book-mid-info"]/h4/a/text()').extract()[0]
        item['author'] = entry.xpath('./div[@class="book-mid-info"]/p[@class="author"]/a[@class="name"]/text()').extract()[0]
        yield item
def parse_detail(self, response):
    """Assemble one chapter item: novel name from request meta, chapter
    title and joined body text from the page itself."""
    chapter_name = response.xpath("//h1/text()").get()
    body_parts = response.xpath("//div[@id='content']//text()").getall()
    yield XiaoshuoItem(name=response.meta['name'],
                       chapter_name=chapter_name,
                       content="".join(body_parts))
def parse(self, response):
    """Three-stage API crawl dispatched on the response URL:
    new_home (book id list) -> getbookinfo -> getcontent (chapter text).

    SECURITY NOTE(review): eval() on a network payload executes arbitrary
    code if the API (or a MITM) returns malicious data — should be replaced
    with json.loads / ast.literal_eval once the payload shape is confirmed.
    NOTE(review): `l`, `token` and `headers` are defined elsewhere in the
    file (module/class scope) — confirm their lifetimes across requests.
    """
    if re.match('https://api.youshuge.com/new_home', response.url):
        s = response.body.decode('unicode_escape')
        # Scan the raw payload collecting every `"id":<value>` into l.
        # When no '"id":' remains, find() returns -1 so s collapses to its
        # last character "}" and the loop terminates.
        while s != "}":
            s = s[s.find('"id":'):]
            if s[5:s.find(',"')]:
                l.append(s[5:s.find(',"')])
                s = s[s.find(',"'):]
            else:
                # No more ids: request book info for the first collected id.
                for i in l[:1]:
                    yield scrapy.FormRequest(
                        url="https://api.youshuge.com/getbookinfo",
                        formdata={
                            "token": token,
                            "id": i
                        },
                        callback=self.parse)
    elif re.match('https://api.youshuge.com/getbookinfo', response.url):
        # Normalise the escaped payload into a Python dict literal
        # (strip backslashes/CRLF, collapse doubled quotes, null -> "").
        d = eval(
            response.body.decode('unicode_escape').replace(
                '\\', '').replace('\r\n', '').replace('""', '"').replace('null', '""'))
        # Request the first chapter's content for this book.
        yield scrapy.FormRequest(url="https://api.youshuge.com/getcontent",
                                 headers=headers,
                                 formdata={
                                     "token": token,
                                     "bookid": str(d['data']['id']),
                                     'chapteid': str(d['data']['read_chapte'])
                                 },
                                 callback=self.parse)
    elif re.match('https://api.youshuge.com/getcontent', response.url):
        d = eval(
            response.body.decode('unicode_escape').replace(
                '\\', '').replace('\r\n', '').replace('""', '"').replace('null', '""'))
        # The API signals a paywall with this literal message; skip those.
        if d['msg'] != '余额不足':
            item = XiaoshuoItem()
            item['book_id'] = d['data']['book_id']
            item['chapte_id'] = d['data']['chapte']['id']
            item['chapte_name'] = d['data']['chapte_name']
            item['content'] = d['data']['content']
            yield item
            # Chain to the next chapter while one exists.
            if d['data']['next_chapte']:
                yield scrapy.FormRequest(
                    url="https://api.youshuge.com/getcontent",
                    headers=headers,
                    formdata={
                        "token": token,
                        "bookid": str(d['data']['book_id']),
                        'chapteid': str(d['data']['next_chapte'])
                    },
                    callback=self.parse)
def dataParse(self, response):
    """Pull author (zuozhe) and title (timu) from the page; either field is
    optional and is simply left unset when its selector matches nothing."""
    for i in response.css("html"):
        item = XiaoshuoItem()
        try:
            item["zuozhe"] = i.css(".w2 > a::text")[0].extract()
        except IndexError:
            # Narrowed from a bare `except:` — only "no match" is expected.
            pass
        try:
            item["timu"] = i.css("h1 > a::text")[0].extract()
        except IndexError:
            pass
        yield item
def get_novelcontent(self, response):
    """Parse a novel's stats page into the final item.  Numeric cells arrive
    as "label:value" strings and are converted when present."""
    novel_name = response.meta['name']   # novel title (from listing page)
    author = response.meta['author']     # author (from listing page)
    novelurl = response.url
    # Total clicks
    click_num_total = response.xpath('//tr[1]/td[1]/text()').extract_first()
    if click_num_total:
        click_num_total = int(click_num_total.split(":")[1])
    # Favourites / bookshelf count
    collect_num_total = response.xpath('//tr[1]/td[2]/text()').extract_first()
    if collect_num_total:
        collect_num_total = int(collect_num_total.split(":")[1])
    # Monthly clicks
    click_num_month = response.xpath('//tr[1]/td[3]/text()').extract_first()
    # BUG FIX: the original guarded this with `if click_num_total:` (copy-
    # paste slip), so a missing monthly cell crashed on .split(None).
    if click_num_month:
        click_num_month = int(click_num_month.split(":")[1]) * 4  # month estimate scale
    # Serialised word count
    serialnumber = response.xpath('//tr[1]/td[4]/text()').extract_first()
    if serialnumber:
        serialnumber = int(serialnumber.split(":")[1])
    # NOTE(review): the original's inline comments labelled these two the
    # other way round — confirm which XPath is status vs. category.
    serialstatus = response.xpath('//div[@class="title"]/i[2]/text()').extract_first()
    category = response.xpath('//div[@class="title"]/a[2]/text()').extract_first()
    targentcontent = XiaoshuoItem()
    targentcontent['novel_name'] = novel_name.strip()
    targentcontent['author'] = author.strip()
    targentcontent['novelurl'] = novelurl
    targentcontent['serialstatus'] = serialstatus
    targentcontent['serialnumber'] = serialnumber
    targentcontent['category'] = category
    targentcontent['collect_num_total'] = int(collect_num_total)
    targentcontent['click_num_total'] = int(click_num_total)
    targentcontent['click_num_month'] = int(click_num_month)
    yield targentcontent
def parse_item(self, response):
    """For each named chapter link in a book's table of contents, request
    the chapter body carrying a partially-filled item in meta['key']."""
    novel_name = response.xpath(".//div[@id='book']//div[@id='info']/h1/text()").get()
    for chapter in response.xpath(".//*[@class='listmain']/dl/dd"):
        c_name = chapter.xpath('./a/text()').get()
        if not c_name:
            continue  # skip decorative rows without a chapter title
        c_url = chapter.xpath('./a/@href').get()
        req = scrapy.Request(url=response.urljoin(c_url),
                             callback=self.parse_content,
                             dont_filter=True)
        req.meta['key'] = XiaoshuoItem(c_name=c_name, novel_name=novel_name)
        yield req
def content(self, response):
    """From a book's index page, build a per-chapter item carrying the book
    title/author/latest-chapter fields and request every chapter body."""
    # Book-level fields are the same for every chapter; read them once.
    title = response.xpath('//*[@id="info"]/h1/text()').extract_first()
    raw_author = response.xpath('//*[@id="info"]/p[1]/text()').extract_first()
    last = response.xpath('//*[@id="info"]/p[3]/text()').extract_first()
    for link in response.xpath('//*[@id="list"]/dl/dd'):
        item = XiaoshuoItem()
        item['title'] = title
        # Strip non-breaking spaces and line breaks from the author line.
        item['author'] = re.sub(r'\xa0|\n|\r', '', raw_author)
        item['last'] = last
        chapter_url = 'http://www.biquge.com.tw' + link.xpath('a/@href').extract_first()
        request = scrapy.Request(chapter_url, self.body, dont_filter=True)
        request.meta['item'] = item
        yield request
def content_parse(self, response):
    """Parse one chapter page into title/page/content, skipping non-200
    responses and pages lxml cannot parse."""
    if response.status != 200:
        return
    html = etree.HTML(response.text)
    if not html:
        return
    titles = html.xpath('//em[@class="l"]/text()')
    pages = html.xpath('//strong[@class ="l jieqi_title"]/text()')
    body = html.xpath('//div[contains(@class, "mainContenr") and @id="content"]/text()')
    item = XiaoshuoItem()
    # Fall back to empty strings when a header element is absent.
    item['title'] = titles[0] if titles else ''
    item['page'] = pages[0] if pages else ''
    item['content'] = body
    yield item
def get_name(self, response):
    """Scrape a listing page whose rows live in <li> elements; entries
    17..65 hold the actual novels.  Follows each novel's detail page."""
    page = response.body.decode('utf-8')
    rows = re.findall(r'<li>(.*?)</li>', page, re.S)
    if not rows[17:66]:
        return
    for row in rows[17:66]:
        # Link texts in order: category, novel name, ..., author (4th).
        texts = re.findall(r'target="_blank">(.*?)</a>', row, re.S)
        category = texts[0]
        novel_name = texts[1]
        author = texts[3]
        novelurl = re.findall(r'<a class="fs14" href="(.*?)" title.*?', row, re.S)[0]
        serialnumber = re.findall(r'<span class="number">(.*?)</span>', row, re.S)[0]
        item = XiaoshuoItem()
        item['novel_name'] = novel_name.strip()
        item['author'] = author.strip()
        item['novelurl'] = novelurl
        item['category'] = category
        item['serialnumber'] = int(serialnumber)
        if novelurl is not None:
            yield Request(str(novelurl), dont_filter=True,
                          callback=self.get_novelcontent,
                          meta={'targentcontent': item})
def get_name(self, response):
    """Scrape the table-style listing (<tr> rows) and follow each novel's
    detail page with the partially-filled item in the request meta."""
    page = response.body.decode('utf-8')
    rows = re.findall(r'<tr class=.*?>(.*?)</tr>', page, re.S)
    if rows:
        for row in rows:
            novelurl = re.findall(r'<a .*? href="(.*?)" target.*?', row, re.S)[0]
            # Anchor texts: category first, novel name second, author last.
            texts = re.findall(r'target="_blank".*?>(.*?)</a>', row, re.S)
            category = texts[0]
            novel_name = texts[1]
            author = texts[-1]
            serialnumber = re.findall(r'<td class="td5">(.*?)</td>', row, re.S)[0]
            serialstatus = re.findall(r'<em class="fc2">(.*?)</em>', row, re.S)[0].strip()
            item = XiaoshuoItem()
            item['novel_name'] = novel_name.strip()
            item['author'] = author.strip()
            item['novelurl'] = novelurl
            item['category'] = category
            item['serialnumber'] = serialnumber
            item['serialstatus'] = serialstatus
            if novelurl is not None:
                yield Request(str(novelurl), dont_filter=True,
                              callback=self.get_novelcontent,
                              meta={'targentcontent': item})
def parse(self, response):
    """Read the category menu (entries 1..7), create a local folder per
    category, and request the first 10 listing pages of each."""
    kind_urls = response.xpath("//div[@class='menu']/ul/li/a/@href").extract()[1:8]
    kind_names = response.xpath("//div[@class='menu']/ul/li/a/span/text()").extract()[1:8]
    for kind_url, kind_name in zip(kind_urls, kind_names):
        if not os.path.exists(kind_name):
            os.makedirs(kind_name)
        # Listing pages are numbered 1..10: <base><n>.htm
        for offset in range(1, 11):
            item = XiaoshuoItem()
            item['kind_name'] = kind_name
            item['kind_url'] = kind_url + str(offset) + '.htm'
            yield scrapy.Request(item['kind_url'], meta={'meta': item},
                                 callback=self.parse_second)
def parse(self, response):
    """Collect name/author/link for every book in the hot-list block.

    Returns the list of items (Scrapy accepts a list as well as a
    generator).
    """
    items = []
    # BUG FIX: the original used double quotes inside a double-quoted XPath
    # string ("//*[@id="hotcontent"]/div[1]"), which is a SyntaxError.
    book_names_As = response.xpath('//*[@id="hotcontent"]/div[1]')
    for book_name_A in book_names_As:
        item = XiaoshuoItem()
        item['novel_name'] = book_name_A.xpath('./div[1]/dl/dt/a/text()').extract()[0]
        item['author'] = book_name_A.xpath('./div[1]/dl/dt/span/text()').extract()[0]
        # NOTE(review): stored as a SelectorList, not extracted — confirm
        # whether .extract() was intended here.
        item['novel_name_urls'] = book_name_A.xpath('//*[@id="hotcontent"]/div[1]/div[1]/dl/dt/a/@href')
        items.append(item)
    return items
def parse_second(self, response):
    """Pair novel links with their names on a category listing page and
    request each novel's chapter index with the accumulated metadata."""
    meta = response.meta['meta']
    novel_urls = response.xpath("//li[@class='conter1']/a/@href").extract()
    novel_names = response.xpath("//li[@class='conter1']/a/text()").extract()
    for novel_url, novel_name in zip(novel_urls, novel_names):
        item = XiaoshuoItem()
        full_url = 'http://www.530p.com' + novel_url
        item['kind_name'] = meta['kind_name']
        item['kind_url'] = meta['kind_url']
        item['novel_name'] = novel_name
        item['novel_url'] = full_url
        # Target text file: <kind>\<novel>.txt
        item['file_novel'] = meta['kind_name'] + '\\' + novel_name + '.txt'
        yield scrapy.Request(full_url, meta={'meta1': item},
                             callback=self.parse_third)
def get_name(self, response):
    """Scrape "bookdetail" blocks (entries 1..30), derive serial status from
    the last-update year, and follow each novel's detail page."""
    myPgae = response.body
    unicodePage = myPgae.decode('utf-8')
    novelsList = re.findall(r'<div class="bookdetail bg">(.*?)</div>', unicodePage, re.S)
    if novelsList:
        for nameinfo in novelsList[1:31]:
            # Anchor texts in order: name, author, category.
            info = re.findall(r'target="_blank".*?>(.*?)</a>', nameinfo, re.S)
            novel_name = info[0]
            author = info[1]
            category = info[2]
            novelurl = re.findall(r'<a href="(.*?)" target.*?', nameinfo, re.S)[0]
            targentcontent = XiaoshuoItem()
            targentcontent['novel_name'] = novel_name.strip()
            targentcontent['author'] = author.strip()
            targentcontent['novelurl'] = novelurl
            targentcontent['category'] = category
            # "book_click" span holds a YYYY-MM-DD date; keep the year part.
            update = re.findall(r'<span class="book_click">(.*?)</span>',
                                nameinfo, re.S)[0].split("-")[0]
            # BUG FIX: the original compared the year *string* to 2017,
            # which raises TypeError on Python 3; convert to int first.
            if int(update) > 2017:
                targentcontent['serialstatus'] = u"连载中"
            else:
                targentcontent['serialstatus'] = u"已完结"
            if novelurl is not None:
                yield Request(str(novelurl), dont_filter=True,
                              callback=self.get_novelcontent,
                              meta={'targentcontent': targentcontent})
def get_name(self, response):
    """Scrape list rows 10..29 of a ranking page; novel URLs are joined onto
    self.baseurl and each detail page is requested."""
    baseurl = response.url
    myPgae = response.body
    unicodePage = myPgae.decode('utf-8')
    novelsList = re.findall(r'<li>(.*?)</li>', unicodePage, re.S)
    # BUG FIX: `print len(...)` is Python 2 statement syntax and a
    # SyntaxError on Python 3 (the rest of this file uses print()).
    print(len(novelsList))
    for nameinfo in novelsList[10:30]:
        novel_name = re.findall(r'target="_blank".*?>(.*?)</a>', nameinfo, re.S)[1]
        novelInfo = re.findall(r'target="blank".*?>(.*?)</a>', nameinfo, re.S)
        author = novelInfo[0].split(">")[2]
        category = novelInfo[1]
        novelurl = re.findall(r'href="(.*?)"', nameinfo, re.S)[0]
        # NOTE(review): joins with self.baseurl, not the local `baseurl`
        # derived from response.url above — confirm which one is intended.
        novelurl = self.baseurl + novelurl
        # <span> values: status first, monthly clicks second, word count sixth.
        info = re.findall(r'<span>(.*?)</span>', nameinfo, re.S)
        serialstatus = info[0]
        click_num_month = int(info[1].split(":")[1])
        serialnumber = int(info[5].split(":")[1])
        targentcontent = XiaoshuoItem()
        targentcontent['novel_name'] = novel_name
        targentcontent['author'] = author
        targentcontent['novelurl'] = novelurl
        targentcontent['serialstatus'] = serialstatus
        targentcontent['serialnumber'] = serialnumber
        targentcontent['category'] = category
        targentcontent['click_num_month'] = int(click_num_month)
        if novelurl is not None:
            yield Request(str(novelurl), dont_filter=True,
                          callback=self.get_novelcontent,
                          meta={'targentcontent': targentcontent})
def parse(self, response):
    """Emit the current page's content/title item, and — while the pager
    link does not end in "/" — keep following the in-book pagination."""
    item = XiaoshuoItem()
    sel = Selector(response)
    old_url = sel.xpath('//div[@class = "fanye_cen"]/a/@href').extract()
    new_url = "http://www.uukanshu.com" + old_url[0]
    if not new_url.endswith("/"):
        item['page'] = old_url[0]
        yield Request(new_url, callback=self.parse)
    else:
        # A trailing "/" means the pager points back at the book index:
        # this was the last page.
        item['page'] = response.url[23:]
        print('下载完成')
    item['content'] = sel.xpath('//div[@id="contentbox"]/text()|//div[@id="contentbox"]/p/text()').extract()
    item['title'] = sel.xpath('//div[@class = "h1title"]/h1/text()').extract()
    yield item
def parse_third(self, response):
    """Walk a novel's chapter list (reversed with [::-1] to invert the
    page's listing order) and request every chapter body, threading the
    accumulated metadata through meta['meta2']."""
    meta1 = response.meta['meta1']
    part_urls = response.xpath("//div[@class='clc']/a/@href").extract()[::-1]
    part_names = response.xpath("//div[@class='clc']/a/text()").extract()[::-1]
    for part_url, part_name in zip(part_urls, part_names):
        item = XiaoshuoItem()
        item['kind_name'] = meta1['kind_name']
        item['kind_url'] = meta1['kind_url']
        item['novel_name'] = meta1['novel_name']
        item['novel_url'] = meta1['novel_url']
        item['file_novel'] = meta1['file_novel']
        item['part_url'] = 'http://www.530p.com' + part_url
        item['part_name'] = part_name
        yield scrapy.Request(item['part_url'], meta={'meta2': item},
                             callback=self.parse_forth)
def parse(self, response):
    """Parse a store listing page into per-novel items and paginate up to
    page 5 (self.index tracks the current page number)."""
    for each in response.xpath("//div[@class='table_con']"):
        # Parallel column lists for the table rows.
        storystyle = each.xpath(".//span[@class='book']/em/a/text()").extract()        # genre
        storyname = each.xpath(".//span[@class='book']//a[@class='f14']/text()").extract()  # title
        storychapter = each.xpath(".//span[@class='book']/a[@target='_blank'][2]/text()").extract()  # latest chapter
        storyclick = each.xpath(".//span[@class='click']/text()").extract()            # total clicks
        storyauthor = each.xpath(".//span[@class='author']/a/text()").extract()        # author
        storyupdatetime = each.xpath(".//span/span[@class='time']/text()").extract()   # update time
        # BUG FIX: the original iterated a hard-coded range(0, 50) and reused
        # a single item object for every yield — short pages raised
        # IndexError and all yielded items aliased the same dict.  Keep the
        # 50-row cap but never overrun, and build a fresh item per row.
        rows = min(50, len(storystyle), len(storyname), len(storychapter),
                   len(storyclick), len(storyauthor), len(storyupdatetime))
        for i in range(rows):
            item = XiaoshuoItem()
            item['storyStyle'] = storystyle[i].strip()
            item['storyName'] = storyname[i].strip()
            item['storyChapter'] = storychapter[i].strip()
            item['storyClick'] = storyclick[i].strip()
            item['storyAuthor'] = storyauthor[i].strip()
            item['storyUpdateTime'] = storyupdatetime[i].strip()
            yield item
    if self.index < 5:
        self.index += 1
        yield scrapy.Request("http://huayu.zongheng.com/store/c0/c0/u1/p" +
                             str(self.index) + "/v0/s0/ALL.html",
                             callback=self.parse)