class NovelSpider(NovelSpiderBase):
    """Placeholder spider registered under the BookBen type name.

    Every extraction hook below returns an empty string, so this class
    defines no XPath rules of its own.
    """

    # Commented-out alternative: name = "NovelBookben"
    name = SpiderTypes.getTypeName_BookBen()

    def __init__(self):
        super().__init__()

    # --- book-level hooks -------------------------------------------------

    def getXpathMainInfo(self, response):
        """Return an empty string (no book-info XPath is defined)."""
        nothing = ""
        return nothing

    def getStrMainInfo_Name(self, info):
        """Return an empty string (no book-name rule is defined)."""
        nothing = ""
        return nothing

    def getStrMainInfo_Author(self, info):
        """Return an empty string (no author rule is defined)."""
        nothing = ""
        return nothing

    # --- chapter-list hooks -----------------------------------------------

    def getXpathList(self, response):
        """Return an empty string (no chapter-list XPath is defined)."""
        nothing = ""
        return nothing

    def getStrItem_Link(self, item):
        """Return an empty string (no chapter-link rule is defined)."""
        nothing = ""
        return nothing

    def getStrItem_Idex(self, item):
        """Return an empty string (no chapter-index rule is defined)."""
        nothing = ""
        return nothing

    # --- chapter-page hooks -----------------------------------------------

    def getXpathItem_Main(self, response):
        """Return an empty string (no chapter-page XPath is defined)."""
        nothing = ""
        return nothing

    def getStrItem_Title(self, xpath_main):
        """Return an empty string (no chapter-title rule is defined)."""
        nothing = ""
        return nothing

    def getStrItem_Content(self, xpath_main):
        """Return an empty string (no chapter-content rule is defined)."""
        nothing = ""
        return nothing
class NovelShiZhangFuRen(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_LWXiaoShuo520().

    Each ``getStr*`` hook indexes ``extract()[0]`` and therefore raises
    IndexError when its XPath matches nothing.
    """

    # Commented-out alternative: name = "NovelLWXS520"
    name = SpiderTypes.getTypeName_LWXiaoShuo520()

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select the ``div.dccss`` cells under ``#defaulthtml4``."""
        list_query = '//div[@id="defaulthtml4"]/table/tbody/tr/td/div[@class="dccss"]'
        return response.xpath(list_query)

    def getXpathMainInfo(self, response):
        """Select the same ``div.dccss`` cells as getXpathList."""
        info_query = '//div[@id="defaulthtml4"]/table/tbody/tr/td/div[@class="dccss"]'
        return response.xpath(info_query)

    def getStrItem_Link(self, item):
        """Return ``start_urls[0]`` joined with the item's first ``<a href>``."""
        relative = item.xpath('./a/@href').extract()[0]
        return self.start_urls[0] + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        labels = item.xpath('./a/text()').extract()
        return labels[0]

    def getXpathItem_Main(self, response):
        """Select the ``div`` inside the ``border_l_r`` table cell."""
        main_query = '//table[@class="border_l_r"]/tbody/tr/td/div'
        return response.xpath(main_query)

    def getStrItem_Name(self, xpath_main):
        """Return the first ``<h2>`` text under *xpath_main*."""
        headings = xpath_main.xpath('./h2/text()').extract()
        return headings[0]

    def getStrItem_Author(self, xpath_main):
        """Return the first text node of ``div.border_b``."""
        texts = xpath_main.xpath('./div[@class="border_b"]/text()').extract()
        return texts[0]

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<h1>`` text under *xpath_main*."""
        headings = xpath_main.xpath('./h1/text()').extract()
        return headings[0]

    def getStrItem_Content(self, xpath_main):
        """Return the list of paragraph text nodes in the nested table."""
        paragraphs = xpath_main.xpath('./table/tbody/tr/td/div/p/text()')
        return paragraphs.extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_QiShuLou().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath
    matches nothing.
    """

    # Commented-out alternative: name = "NovelQiShuLou"
    name = SpiderTypes.getTypeName_QiShuLou()

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select the ``<li>`` entries of the ``book-list clearfix`` block."""
        list_query = '//div[@id="content-list"]/div[@class="book-list clearfix"]/ul/li'
        return response.xpath(list_query)

    def getXpathMainInfo(self, response):
        """Select the ``book-describe`` block inside ``book-intro clearfix``."""
        info_query = '//div[@id="content-list"]/div[@class="book-intro clearfix"]/div[@class="book-describe"]'
        return response.xpath(info_query)

    def getStrMainInfo_Name(self, info):
        """Return the first ``<h1>`` text under *info*."""
        headings = info.xpath('./h1/text()').extract()
        return headings[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``<p>`` text under *info*."""
        paragraphs = info.xpath('./p/text()').extract()
        return paragraphs[0]

    def getStrItem_Link(self, item):
        """Return the item's first ``<a href>`` value unchanged."""
        hrefs = item.xpath('./a/@href').extract()
        return hrefs[0]

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        labels = item.xpath('./a/text()').extract()
        return labels[0]

    def getXpathItem_Main(self, response):
        """Select the ``article.post.clearfix`` element under ``#pagewrap``."""
        main_query = '//div[@id="pagewrap"]/article[@class="post clearfix"]'
        return response.xpath(main_query)

    def getStrItem_Title(self, xpath_main):
        """Return the header ``<h1>`` text with spaces, CR, LF and tabs removed."""
        heading = xpath_main.xpath('.//header[@class="post-header clearfix"]/h1/text()').extract()[0]
        cleaned = heading.strip()
        # Same removal order as the chained replace calls it stands in for.
        for unwanted in (' ', '\r', '\n', '\t'):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned

    def getStrItem_Content(self, xpath_main):
        """Return every descendant text node of *xpath_main* as a list."""
        text_nodes = xpath_main.xpath('.//text()')
        return text_nodes.extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_YanQingKu().

    The chapter page puts the title and the author in one ``div.title`` text
    node; the literal marker "作者" is used to split the two.
    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing.
    """

    # Commented-out alternative: name = "NovelGuaZiBpi"
    name = SpiderTypes.getTypeName_YanQingKu()

    def __init__(self):
        super().__init__()
        self.headLink = "http://www.yqk.net/yanqing"

    def getXpathMainInfo(self, response):
        """Select the ``div.base`` block."""
        return response.xpath('//div[@class="base"]')

    def getStrMainInfo_Name(self, info):
        """Return the text of the first ``p/strong/a`` under *info*."""
        return info.xpath('./p/strong/a/text()').extract()[0]

    def getStrMainInfo_Author(self, info):
        """Return the text of the first ``p/a`` under *info*."""
        return info.xpath('./p/a/text()').extract()[0]

    def getXpathList(self, response):
        """Select the ``<dd>`` entries of ``dl.chapter``."""
        return response.xpath('//dl[@class="chapter"]/dd')

    def getStrItem_Link(self, item):
        """Return the item's first ``<a href>`` value unchanged."""
        return item.xpath('./a/@href').extract()[0]

    def getStrItem_Idex(self, item):
        """Return the part of the item's href that precedes ``.html``.

        Raises IndexError when the href is missing or the pattern does not
        match.
        """
        lastLink = item.xpath('./a/@href').extract()[0]
        return re.findall("(.*).html.*", lastLink)[0]

    def getXpathItem_Main(self, response):
        """Select the ``div.main`` block."""
        return response.xpath('//div[@class="main"]')

    def _getCleanTitleText(self, xpath_main):
        """Return the ``div.title`` text with spaces, CR, LF and tabs removed."""
        raw = xpath_main.xpath('.//div[@class="title"]/text()').extract()[0]
        return raw.strip().replace(' ', '').replace('\r', '').replace(
            '\n', '').replace('\t', '')

    def getStrItem_Title(self, xpath_main):
        """Return the cleaned title text that precedes the "作者" marker.

        When the marker is absent the whole cleaned text is returned.
        """
        title = self._getCleanTitleText(xpath_main)
        lastIndex = title.find("作者")
        if lastIndex < 0:
            # str.find returns -1 when not found; slicing with -1 would
            # silently drop the last character of the title.
            return title
        return title[:lastIndex]

    def getStrItem_Author(self, xpath_main):
        """Return the cleaned title text from the "作者" marker onward.

        Returns "" when the marker is absent or sits at index 0 (the
        ``> 0`` test is kept as-is).
        """
        title = self._getCleanTitleText(xpath_main)
        lastIndex = title.find("作者")
        if lastIndex > 0:
            return title[lastIndex:]
        return ""

    def getStrItem_Content(self, xpath_main):
        """Return every text node under ``div.content`` as a list."""
        return xpath_main.xpath('.//div[@class="content"]//text()').extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_SDKK88().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing.
    """

    # Commented-out alternative: name = "NovelSDKK88"
    name = SpiderTypes.getTypeName_SDKK88()

    def __init__(self):
        super().__init__()
        self.headLink = "http://www.sbkk88.com"

    def getXpathList(self, response):
        """Select the ``<li>`` entries of ``ul.leftList``."""
        return response.xpath(
            '//div[@class="mingzhuMain"]/div[@class="mingzhuLeft"]/ul[@class="leftList"]/li'
        )

    def getXpathMainInfo(self, response):
        """Select the same ``<li>`` entries as getXpathList."""
        return response.xpath(
            '//div[@class="mingzhuMain"]/div[@class="mingzhuLeft"]/ul[@class="leftList"]/li'
        )

    def getStrMainInfo_Name(self, info):
        """Return the heading text after its last ":" or the whole heading.

        The XPath starts with ``//``, so it searches the whole document
        rather than only below *info*. The full heading is returned when it
        contains no ":" or nothing follows the ":".
        """
        heading = info.xpath(
            '//div[@class="mingzhuMain"]/div[@class="mingzhuLeft"]/div[@class="mingzhuTitle"]/h1/text()'
        ).extract()[0]
        # findall returns [] when there is no ":"; indexing it directly
        # would raise IndexError before the fallback could run.
        matches = re.findall(".*:(.*)", heading)
        if matches and len(matches[0]) > 0:
            return matches[0]
        return heading

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the item's first ``<a href>``."""
        part_url = item.xpath('./a/@href').extract()[0]
        return self.headLink + part_url

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        return item.xpath('./a/text()').extract()[0]

    def getXpathItem_Main(self, response):
        """Return *response* itself; the item hooks query it directly."""
        return response

    def getStrItem_Author(self, xpath_main):
        """Return the ``<h3>`` text of the second ``./dd`` match."""
        # NOTE(review): getXpathItem_Main returns the response, so './dd' is
        # evaluated relative to it; confirm this selects anything.
        return xpath_main.xpath('./dd')[1].xpath('./h3/text()').extract()[0]

    def getStrItem_Title(self, xpath_main):
        """Return the ``<h1>`` text inside ``#f_title1``."""
        return xpath_main.xpath('//div[@id="f_title1"]/h1/text()').extract()[0]

    def getStrItem_Content(self, xpath_main):
        """Return the paragraph text nodes of ``#f_article`` as a list."""
        return xpath_main.xpath(
            '//div[@id="f_content1"]/div[@id="f_article"]/p/text()').extract()
class NovelYanYang(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_2KXiaoShuo().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing; getStrItem_Link is the exception and returns "" instead.
    """

    # Commented-out alternative: name = "Novel2KXS"
    name = SpiderTypes.getTypeName_2KXiaoShuo()

    def __init__(self):
        super().__init__()
        self.headLink = "http://www.2kxs.com"

    def getXpathList(self, response):
        """Select the ``<dd>`` entries of ``dl.book``."""
        return response.xpath('//dl[@class="book"]/dd')

    def getXpathMainInfo(self, response):
        """Select the ``#title`` block inside ``#bookinfo``."""
        return response.xpath('//div[@id="bookinfo"]/div[@id="title"]')

    def getStrMainInfo_Name(self, info):
        """Return the first ``<h1>`` text under *info*."""
        return info.xpath('./h1/text()').extract()[0]

    def getStrMainInfo_Author(self, info):
        """Return the link text inside ``address.author``."""
        return info.xpath('./address[@class="author"]/a/text()').extract()[0]

    def getStrItem_Link(self, item):
        """Return ``urls[0]`` + href, or "" for items to skip.

        "" is returned when the item has no ``<a href>`` or when the href
        does not contain ``BooksSetting.getHtmlLast()``.
        """
        # Explicit empty check instead of a bare ``except:``, which would
        # also swallow KeyboardInterrupt/SystemExit.
        hrefs = item.xpath('./a/@href').extract()
        part_url = hrefs[0] if hrefs else ""
        if BooksSetting.getHtmlLast() in part_url:
            return self.urls[0] + part_url
        return ""

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        return item.xpath('./a/text()').extract()[0]

    def getXpathItem_Main(self, response):
        """Select the ``#box`` block."""
        return response.xpath('//div[@id="box"]')

    def getStrItem_Title(self, xpath_main):
        """Return the ``<h2>`` text with spaces, CR, LF and tabs removed."""
        return xpath_main.xpath('./h2/text()').extract()[0].strip().replace(
            ' ', '').replace('\r', '').replace('\n', '').replace('\t', '')

    def getStrItem_Content(self, xpath_main):
        """Return the text nodes of ``p.Text`` as a list."""
        return xpath_main.xpath('./p[@class="Text"]/text()').extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_ShuXiangGe().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing.
    """

    # Commented-out alternative: name = "NovelShuXiangGe"
    name = SpiderTypes.getTypeName_ShuXiangGe()

    def __init__(self):
        super().__init__()

    def getXpathMainInfo(self, response):
        """Select the ``div.book`` block inside ``div.info``."""
        info_query = '//div[@class="mu_contain"]/div[@class="info"]/div[@class="book"]'
        return response.xpath(info_query)

    def getStrMainInfo_Name(self, info):
        """Return the link text inside the first ``<h1>``."""
        names = info.xpath('./h1/a/text()').extract()
        return names[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``dl/dt`` text under *info*."""
        authors = info.xpath('./dl/dt/text()').extract()
        return authors[0]

    def getXpathList(self, response):
        """Select the ``<li>`` entries of ``ul.mulu_list``."""
        list_query = '//div[@class="warpper"]/div[@class="mu_contain"]/ul[@class="mulu_list"]/li'
        return response.xpath(list_query)

    def getStrItem_Link(self, item):
        """Return ``urls[0]`` joined with the item's first ``<a href>``."""
        relative = item.xpath('./a/@href').extract()[0]
        return self.urls[0] + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        labels = item.xpath('./a/text()').extract()
        return labels[0]

    def getXpathItem_Main(self, response):
        """Select the cell of the ``#content`` table."""
        main_query = '//table[@id="content"]/tbody/tr/td'
        return response.xpath(main_query)

    def getStrItem_Title(self, xpath_main):
        """Return the ``<h1>`` text with spaces, CR, LF and tabs removed."""
        cleaned = xpath_main.xpath('./h1/text()').extract()[0].strip()
        for unwanted in (' ', '\r', '\n', '\t'):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned

    def getStrItem_Content(self, xpath_main):
        """Return the direct text nodes of ``#htmlContent`` as a list."""
        text_nodes = xpath_main.xpath('./div[@id="htmlContent"]/text()')
        return text_nodes.extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_YueDu163().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing.
    """

    # Commented-out alternative: name = "NovelYueDu163"
    name = SpiderTypes.getTypeName_YueDu163()

    def __init__(self):
        super().__init__()
        self.headLink = "http://yuedu.163.com"

    def getXpathMainInfo(self, response):
        """Select the ``div.g-mn`` block."""
        info_query = '//div[@class="g-mn"]'
        return response.xpath(info_query)

    def getXpathList(self, response):
        """Select ``./div/div/ul`` beneath the getXpathMainInfo result."""
        main_info = self.getXpathMainInfo(response)
        return main_info.xpath('./div/div/ul')

    def getStrMainInfo_Name(self, info):
        """Return the ``title`` attribute of the book-detail ``<h3>``."""
        name_query = './div[@class="m-bookdetail"]/div[@class="f-fl"]/h3/@title'
        titles = info.xpath(name_query).extract()
        return titles[0]

    def getStrMainInfo_Author(self, info):
        """Return the link text inside the book-detail ``<h3>`` span."""
        author_query = './div[@class="m-bookdetail"]/div[@class="f-fl"]/h3/span/a/text()'
        authors = info.xpath(author_query).extract()
        return authors[0]

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the item's first ``<a href>``."""
        relative = item.xpath('./a/@href').extract()[0]
        return self.headLink + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        labels = item.xpath('./a/text()').extract()
        return labels[0]

    def getXpathItem_Main(self, response):
        """Select the ``ne-content J_NEContent`` block of the article."""
        main_query = '//div[@class="article J_Article"]/div[@class="portrait-page-box J_PortraitMoveBox"]/div[@class="article-content"]/div[@class="ne-content J_NEContent"]'
        return response.xpath(main_query)

    def getStrItem_Title(self, xpath_main):
        """Return the ``<h1>`` text with spaces, CR, LF and tabs removed."""
        cleaned = xpath_main.xpath('./h1/text()').extract()[0].strip()
        for unwanted in (' ', '\r', '\n', '\t'):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned

    def getStrItem_Content(self, xpath_main):
        """Return the ``<p>`` text nodes under *xpath_main* as a list."""
        paragraphs = xpath_main.xpath('./p/text()')
        return paragraphs.extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_XinShuBao().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing.
    """

    # Commented-out alternative: name = "NovelXinShuBao"
    name = SpiderTypes.getTypeName_XinShuBao()

    def __init__(self):
        super().__init__()

    def getXpathMainInfo(self, response):
        """Select the ``#info`` block inside ``#maininfo``."""
        info_query = '//div[@class="box_con"]/div[@id="maininfo"]/div[@id="info"]'
        return response.xpath(info_query)

    def getStrMainInfo_Name(self, info):
        """Return the first ``<h1>`` text under *info*."""
        headings = info.xpath('./h1/text()').extract()
        return headings[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``<p>`` text under *info*."""
        paragraphs = info.xpath('./p/text()').extract()
        return paragraphs[0]

    def getXpathList(self, response):
        """Select the ``<li>`` entries of ``#list`` inside ``#btycz``."""
        list_query = '//div[@id="btycz"]/div[@id="list"]/ul/li'
        return response.xpath(list_query)

    def getStrItem_Link(self, item):
        """Return the item's first ``<a href>`` value unchanged."""
        hrefs = item.xpath('./a/@href').extract()
        return hrefs[0]

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        labels = item.xpath('./a/text()').extract()
        return labels[0]

    def getXpathItem_Main(self, response):
        """Select the ``div.box_con`` block inside ``div.content_read``."""
        main_query = '//div[@class="content_read"]/div[@class="box_con"]'
        return response.xpath(main_query)

    def getStrItem_Title(self, xpath_main):
        """Return the bookname ``<h1>`` text with spaces, CR, LF and tabs removed."""
        cleaned = xpath_main.xpath('./div[@class="bookname"]/h1/text()').extract()[0].strip()
        for unwanted in (' ', '\r', '\n', '\t'):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned

    def getStrItem_Content(self, xpath_main):
        """Return the direct text nodes of ``#content`` as a list."""
        text_nodes = xpath_main.xpath('./div[@id="content"]/text()')
        return text_nodes.extract()
class NovelShiZhangFuRen(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_LeWenXiaoShuo().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing.
    """

    # Commented-out alternative: name = "NovelLeWenXiaoShuo"
    name = SpiderTypes.getTypeName_LeWenXiaoShuo()

    def __init__(self):
        super().__init__()
        self.headLink = "http://www.lwxiaoshuo.com"
        self.web_head = BooksSetting.getHtml()
        self.web_last = ".html"

    def getXpathList(self, response):
        """Select the ``<tbody>`` of the table styled ``MARGIN-BOTTOM: 10px``."""
        return response.xpath('//table[@style="MARGIN-BOTTOM: 10px"]/tbody')

    def getXpathMainInfo(self, response):
        """Select the same ``<tbody>`` as getXpathList."""
        return response.xpath('//table[@style="MARGIN-BOTTOM: 10px"]/tbody')

    def getStrItem_Name(self, xpath_main):
        """Return the ``div/h1`` text under *xpath_main*."""
        return xpath_main.xpath('./div/h1/text()').extract()[0]

    def getStrItem_Author(self, xpath_main):
        """Return the first text node of ``div/div.border_b``."""
        return xpath_main.xpath(
            './div/div[@class="border_b"]/text()').extract()[0]

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the first ``div.dccss`` href.

        Raises IndexError when *item* contains no such link.
        """
        hrefs = item.xpath('.//tr/td/div[@class="dccss"]/a/@href').extract()
        # extract() yields a list; concatenating it to a str raised
        # TypeError, so take the first href as the sibling hooks do.
        part_url = hrefs[0]
        return self.headLink + part_url

    def getStrItem_Idex(self, item):
        """Return the list of ``div.dccss`` link texts found under *item*."""
        # NOTE(review): this returns a list while most sibling spiders return
        # a single string from this hook; return shape left unchanged.
        return item.xpath('.//tr/td/div[@class="dccss"]/a/text()').extract()

    def getXpathItem_Main(self, response):
        """Select the cell of the ``border_l_r`` table."""
        return response.xpath('//table[@class="border_l_r"]/tbody/tr/td')

    def getStrItem_Title(self, xpath_main):
        """Return the ``div/h2`` text with spaces, CR, LF and tabs removed."""
        return xpath_main.xpath(
            './div/h2/text()').extract()[0].strip().replace(' ', '').replace(
                '\r', '').replace('\n', '').replace('\t', '')

    def getStrItem_Content(self, xpath_main):
        """Return the paragraph text nodes of the nested table as a list."""
        return xpath_main.xpath('./table/tbody/tr/td/div/p/text()').extract()
class NovelSangWu(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_SangWu().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing.
    """

    # Commented-out alternative: name = "NovelSangWu"
    name = SpiderTypes.getTypeName_SangWu()
    start_urls = ["http://www.sangwu.org/book/5/5952/"]

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select every ``<dd>`` element in the document."""
        return response.xpath('//dd')

    def getXpathMainInfo(self, response):
        """Select the ``<li>`` entries of the ``wp b2 info_chapterlist`` block."""
        return response.xpath('//div[@class="wp b2 info_chapterlist"]/ul/li')

    def getStrItem_Link(self, item):
        """Return ``start_urls[0]`` joined with the item's first ``<a href>``.

        Raises IndexError when *item* has no ``<a href>``.
        """
        hrefs = item.xpath('./a/@href').extract()
        # extract() yields a list; concatenating it to a str raised
        # TypeError, so take the first href as the sibling hooks do.
        part_url = hrefs[0]
        return self.start_urls[0] + part_url

    def getStrItem_Idex(self, item):
        """Return the list of ``<a>`` text nodes under *item*."""
        # NOTE(review): this returns a list while most sibling spiders return
        # a single string from this hook; return shape left unchanged.
        return item.xpath('./a/text()').extract()

    def getXpathItem_Main(self, response):
        """Select the ``div.readmain`` block."""
        return response.xpath('//div[@class="readmain"]')

    def getStrItem_Name(self, xpath_main):
        """Return the bookname ``<h2>`` text."""
        return xpath_main.xpath(
            './div[@class="bookname"]/h2/text()').extract()[0]

    def getStrItem_Author(self, xpath_main):
        """Return the bookname ``<h2>`` text (same XPath as getStrItem_Name)."""
        return xpath_main.xpath(
            './div[@class="bookname"]/h2/text()').extract()[0]

    def getStrItem_Title(self, xpath_main):
        """Return the bookname ``<h1>`` text."""
        return xpath_main.xpath(
            './div[@class="bookname"]/h1/text()').extract()[0]

    def getStrItem_Content(self, xpath_main):
        """Return the direct text nodes of ``div.centent`` as a list."""
        return xpath_main.xpath('./div[@class="centent"]/text()').extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_JJWXC().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing; getStrItem_Link is the exception and returns "" instead.
    """

    # Commented-out alternative: name = "NovelJJWXC"
    name = SpiderTypes.getTypeName_JJWXC()

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select the ``tr[itemprop=chapter]`` rows of ``table.cytable``."""
        return response.xpath(
            '//table[@class="cytable"]/tbody/tr[@itemprop="chapter"]')

    def getXpathMainInfo(self, response):
        """Select the same chapter rows as getXpathList."""
        return response.xpath(
            '//table[@class="cytable"]/tbody/tr[@itemprop="chapter"]')

    def getStrItem_Name(self, xpath_main):
        """Return the ``td.noveltitle`` heading text, whitespace removed."""
        return xpath_main.xpath(
            './/td[@class="noveltitle"]/h1/a/span/text()'
        ).extract()[0].strip().replace(' ', '').replace('\r', '').replace(
            '\n', '').replace('\t', '')

    def getStrItem_Author(self, xpath_main):
        """Return the text of the link directly under ``td.noveltitle``."""
        return xpath_main.xpath(
            './/td[@class="noveltitle"]/a/text()').extract()[0]

    def getStrItem_Link(self, item):
        """Return the headline link's href, or "" when the row has none."""
        # Explicit empty check instead of a bare ``except:``, which would
        # also swallow KeyboardInterrupt/SystemExit.
        hrefs = item.xpath(
            './td/span[@itemprop="headline"]/div[@style="float:left"]/a/@href'
        ).extract()
        return hrefs[0] if hrefs else ""

    def getStrItem_Idex(self, item):
        """Return the headline link's text."""
        return item.xpath(
            './td/span[@itemprop="headline"]/div[@style="float:left"]/a/text()'
        ).extract()[0]

    def getXpathItem_Main(self, response):
        """Select the ``#oneboolt`` table."""
        return response.xpath('//table[@id="oneboolt"]')

    def getStrItem_Title(self, xpath_main):
        """Return the ``<h2>`` text of the second ``div.noveltext/div``.

        Spaces, CR, LF and tabs are removed from the result.
        """
        return xpath_main.xpath('.//div[@class="noveltext"]/div')[1].xpath(
            './h2/text()').extract()[0].strip().replace(' ', '').replace(
                '\r', '').replace('\n', '').replace('\t', '')

    def getStrItem_Content(self, xpath_main):
        """Return the direct text nodes of ``div.noveltext`` as a list."""
        return xpath_main.xpath('.//div[@class="noveltext"]/text()').extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_BookBao().

    Hooks that index ``extract()[0]`` or a selector list raise IndexError
    when the XPath matches too few nodes.
    """

    # Commented-out alternative: name = "NovelBookbao8"
    name = SpiderTypes.getTypeName_BookBao()

    def __init__(self):
        super().__init__()
        self.headLink = "https://www.bookbao8.com"

    def getXpathList(self, response):
        """Select the ``<li>`` entries of the ``wp b2 info_chapterlist`` block."""
        list_query = '//div[@class="wp b2 info_chapterlist"]/ul/li'
        return response.xpath(list_query)

    def getXpathMainInfo(self, response):
        """Select the same ``<li>`` entries as getXpathList."""
        info_query = '//div[@class="wp b2 info_chapterlist"]/ul/li'
        return response.xpath(info_query)

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the item's first ``<a href>``."""
        relative = item.xpath('./a/@href').extract()[0]
        return self.headLink + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        labels = item.xpath('./a/text()').extract()
        return labels[0]

    def getXpathItem_Main(self, response):
        """Select the ``<dl>`` inside ``div.bdsub``."""
        main_query = '//div[@class="bdsub"]/dl'
        return response.xpath(main_query)

    def getStrItem_Name(self, xpath_main):
        """Return the ``h1/a`` text of the first ``<dd>``."""
        first_dd = xpath_main.xpath('./dd')[0]
        return first_dd.xpath('./h1/a/text()').extract()[0]

    def getStrItem_Author(self, xpath_main):
        """Return the ``<h3>`` text of the second ``<dd>``."""
        second_dd = xpath_main.xpath('./dd')[1]
        return second_dd.xpath('./h3/text()').extract()[0]

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<dd>``'s ``<h1>`` text, whitespace removed."""
        first_dd = xpath_main.xpath('./dd')[0]
        cleaned = first_dd.xpath('./h1/text()').extract()[0].strip()
        for unwanted in (' ', '\r', '\n', '\t'):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned

    def getStrItem_Content(self, xpath_main):
        """Return the direct text nodes of ``dd#contents`` as a list."""
        text_nodes = xpath_main.xpath('./dd[@id="contents"]/text()')
        return text_nodes.extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_7xxs().

    Hooks that index ``extract()[0]`` or ``re.findall(...)[0]`` raise
    IndexError when nothing matches.
    """

    # Commented-out alternative: name = "Novel7xxs"
    name = SpiderTypes.getTypeName_7xxs()

    def __init__(self):
        super().__init__()
        self.headLink = "http://www.7xxs.net"

    def getXpathList(self, response):
        """Select the ``<dd>`` entries of ``#list``."""
        list_query = '//div[@class="box_con"]/div[@id="list"]/dl/dd'
        return response.xpath(list_query)

    def getXpathMainInfo(self, response):
        """Select the ``#maininfo`` block."""
        info_query = '//div[@class="box_con"]/div[@id="maininfo"]'
        return response.xpath(info_query)

    def getStrMainInfo_Name(self, info):
        """Return the first text node of ``#intro``."""
        texts = info.xpath('./div[@id="intro"]/text()').extract()
        return texts[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``<p>`` text inside ``#info``."""
        texts = info.xpath('./div[@id="info"]/p/text()').extract()
        return texts[0]

    def getStrItem_Link(self, item):
        """Return ``headLink`` joined with the item's first ``<a href>``."""
        relative = item.xpath('./a/@href').extract()[0]
        return self.headLink + relative

    def getStrItem_Idex(self, item):
        """Return the href segment between the last ``/`` and ``.html``."""
        href = item.xpath('./a/@href').extract()[0]
        captured = re.findall(".*/(.*).html.*", href)
        return captured[0]

    def getXpathItem_Main(self, response):
        """Select the ``div.box_con`` block inside ``div.content_read``."""
        main_query = '//div[@class="content_read"]/div[@class="box_con"]'
        return response.xpath(main_query)

    def getStrItem_Title(self, xpath_main):
        """Return the bookname ``<h1>`` text with spaces, CR, LF and tabs removed."""
        cleaned = xpath_main.xpath('.//div[@class="bookname"]/h1/text()').extract()[0].strip()
        for unwanted in (' ', '\r', '\n', '\t'):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned

    def getStrItem_Content(self, xpath_main):
        """Return the direct text nodes of ``#content`` as a list."""
        text_nodes = xpath_main.xpath('.//div[@id="content"]/text()')
        return text_nodes.extract()
class NovelSpider1(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_BiQuGuan().

    Hooks that index ``extract()[0]`` raise IndexError when the XPath matches
    nothing.
    """

    # Commented-out alternative: name = "NovelBQG"
    name = SpiderTypes.getTypeName_BiQuGuan()

    def __init__(self):
        super().__init__()

    def getXpathList(self, response):
        """Select the ``<dd>`` entries of ``#list`` under ``#wrapper``."""
        list_query = '//div[@id="wrapper"]/div[@class="box_con"]/div[@id="list"]/dl/dd'
        return response.xpath(list_query)

    def getXpathMainInfo(self, response):
        """Select the ``#info`` block inside ``#maininfo``."""
        info_query = '//div[@id="wrapper"]/div[@class="box_con"]/div[@id="maininfo"]/div[@id="info"]'
        return response.xpath(info_query)

    def getStrMainInfo_Name(self, info):
        """Return the first ``<h1>`` text under *info*."""
        headings = info.xpath('./h1/text()').extract()
        return headings[0]

    def getStrMainInfo_Author(self, info):
        """Return the first ``<p>`` text under *info*."""
        paragraphs = info.xpath('./p/text()').extract()
        return paragraphs[0]

    def getStrItem_Link(self, item):
        """Return ``urls[0]`` joined with the item's first ``<a href>``."""
        relative = item.xpath('./a/@href').extract()[0]
        return self.urls[0] + relative

    def getStrItem_Idex(self, item):
        """Return the text of the item's first ``<a>``."""
        labels = item.xpath('./a/text()').extract()
        return labels[0]

    def getXpathItem_Main(self, response):
        """Select every ``div.box_con`` block."""
        main_query = '//div[@class="box_con"]'
        return response.xpath(main_query)

    def getStrItem_Title(self, xpath_main):
        """Return the bookname ``<h1>`` text with spaces, CR, LF and tabs removed."""
        cleaned = xpath_main.xpath('.//div[@class="bookname"]/h1/text()').extract()[0].strip()
        for unwanted in (' ', '\r', '\n', '\t'):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned

    def getStrItem_Content(self, xpath_main):
        """Return the direct text nodes of ``#content`` as a list."""
        text_nodes = xpath_main.xpath('.//div[@id="content"]/text()')
        return text_nodes.extract()
class NovelSpider(NovelSpiderBase):
    """XPath hooks for the spider named by SpiderTypes.getTypeName_MaoPu().

    Name, author and title all read the same ``./h1`` text. Hooks that index
    ``extract()[0]`` raise IndexError when the XPath matches nothing.
    """

    # Commented-out alternative: name = "NovelMaoPu"
    name = SpiderTypes.getTypeName_MaoPu()

    def __init__(self):
        super().__init__()
        # Assigned here but not read by any method of this class.
        self.headLink = "https://www.bookbao8.com"

    def getXpathList(self, response):
        """Select the ``<li>`` entries of ``ul.mulu_list``."""
        list_query = '//div[@class="mu_contain"]/ul[@class="mulu_list"]/li'
        return response.xpath(list_query)

    def getXpathMainInfo(self, response):
        """Select the ``<li>`` entries of the ``wp b2 info_chapterlist`` block."""
        info_query = '//div[@class="wp b2 info_chapterlist"]/ul/li'
        return response.xpath(info_query)

    def getStrItem_Link(self, item):
        """Return ``start_urls[0]`` joined with the item's first ``<a href>``."""
        relative = item.xpath('./a/@href').extract()[0]
        return self.start_urls[0] + relative

    def getStrItem_Idex(self, item):
        """Return an empty string (no chapter-index rule is defined)."""
        nothing = ""
        return nothing

    def getXpathItem_Main(self, response):
        """Select the ``#content`` block."""
        main_query = '//div[@id="content"]'
        return response.xpath(main_query)

    def getStrItem_Name(self, xpath_main):
        """Return the first ``<h1>`` text under *xpath_main*."""
        headings = xpath_main.xpath('./h1/text()').extract()
        return headings[0]

    def getStrItem_Author(self, xpath_main):
        """Return the first ``<h1>`` text under *xpath_main*."""
        headings = xpath_main.xpath('./h1/text()').extract()
        return headings[0]

    def getStrItem_Title(self, xpath_main):
        """Return the first ``<h1>`` text under *xpath_main*."""
        headings = xpath_main.xpath('./h1/text()').extract()
        return headings[0]

    def getStrItem_Content(self, xpath_main):
        """Return the direct text nodes of ``div.chapter-content`` as a list."""
        text_nodes = xpath_main.xpath('./div[@class="chapter-content"]/text()')
        return text_nodes.extract()
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_MaoPu()."""
    type_name = SpiderTypes.getTypeName_MaoPu()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_LeWenXiaoShuo()."""
    type_name = SpiderTypes.getTypeName_LeWenXiaoShuo()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_SDKK88()."""
    type_name = SpiderTypes.getTypeName_SDKK88()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_XS74()."""
    type_name = SpiderTypes.getTypeName_XS74()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_QiShuLou()."""
    type_name = SpiderTypes.getTypeName_QiShuLou()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_BookBao()."""
    type_name = SpiderTypes.getTypeName_BookBao()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_DiJiuZWW()."""
    type_name = SpiderTypes.getTypeName_DiJiuZWW()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_LWXiaoShuo520()."""
    type_name = SpiderTypes.getTypeName_LWXiaoShuo520()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_YanQingKu()."""
    type_name = SpiderTypes.getTypeName_YanQingKu()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_JJWXC()."""
    type_name = SpiderTypes.getTypeName_JJWXC()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_7xxs()."""
    type_name = SpiderTypes.getTypeName_7xxs()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_BiQuGuan()."""
    type_name = SpiderTypes.getTypeName_BiQuGuan()
    return type_name
def getScrapyType(self):
    """Return the type name given by SpiderTypes.getTypeName_XinShuBao()."""
    type_name = SpiderTypes.getTypeName_XinShuBao()
    return type_name