Exemple #1
0
 def getStrItem_Link(self, item):
     """Build the link for one entry of the chapter list.

     Reads the first ``./a/@href`` value found under ``item``. When that
     href contains ``BooksSetting.getHtmlLast()`` the result is
     ``self.urls[0] + href``; in every other case (including a missing
     href) an empty string is returned.
     """
     try:
         part_url = item.xpath('./a/@href').extract()[0]
     except Exception:
         # Best effort: an entry without a usable href simply yields no
         # link. ``Exception`` (not a bare ``except``) so that
         # KeyboardInterrupt / SystemExit still propagate.
         part_url = ""
     if BooksSetting.getHtmlLast() in part_url:
         return self.urls[0] + part_url
     return ""
Exemple #2
0
 def process_item(self, item, spider):
     """Collect ``item`` and rewrite the whole output file.

     The item is appended to ``self.all``, which is then re-sorted by
     each item's ``'chapter'`` value (direction taken from
     ``BooksSetting.getHtmlRegOrderReverse()``). The output file is
     rewritten from scratch on every call: one header line built from the
     current item's name and author, followed by the title and content of
     every collected item in sorted order.

     Returns the item unchanged.
     """
     self.all.append(item)
     self.all.sort(key=lambda i: i['chapter'],
                   reverse=BooksSetting.getHtmlRegOrderReverse())
     # ``with`` guarantees the handle is closed even if a write fails
     # (e.g. an item is missing one of the expected keys).
     with open(self.out_file, 'w', encoding='UTF-8') as out:
         out.write(item['name'] + '\n\t\t' + item['author'] + '\n')
         for chapter in self.all:
             out.write(chapter['title'] + '\n')
             out.write(chapter['content'] + '\n\n')
     return item
Exemple #3
0
 def parse(self, response):
     """Parse the index page and request the first not-yet-seen chapter.

     Stores the name and author extracted from the main-info node on the
     instance, then walks the chapter entries. Entries without a link and
     links already present in ``self.urls`` are skipped. For the first
     remaining link a GET ``Request`` with ``parse_item`` as callback is
     yielded and the link is recorded in ``self.urls``.
     """
     entries = self.getXpathList(response)
     info = self.getXpathMainInfo(response)
     print("info=", info.extract())
     self.item_name = self.getStrMainInfo_Name(info)
     self.item_author = self.getStrMainInfo_Author(info)
     base_url = BooksSetting.getHtml()
     for entry in entries:
         link = self.getStrItem_Link(entry)
         print("link=", link)
         # The empty check has to happen before the base URL is
         # prepended; afterwards the link can never be empty.
         if link == '':
             continue
         if not link.startswith(base_url):
             link = base_url + link
         if link in self.urls:
             continue
         index = self.getStrItem_Idex(entry)
         print("index=%s, link=%s" % (index, link))
         self.urls.append(link)
         yield Request(link, method="GET", callback=self.parse_item)
         # NOTE(review): this stops after the first new link, so each
         # call yields at most one request -- confirm this is intended.
         break
Exemple #4
0
 def parse_item(self, response):
     """Turn one chapter response into a ``ScrapynovelItem``.

     Name and author fall back to the per-page hooks only when the values
     stored on the instance are blank. The chapter value is the first
     match of ``BooksSetting.getHeadHtmlReg()`` in the response URL,
     prefixed with ``'0'`` when it is at most one character long.
     """
     self.print_response(response)
     main_node = self.getXpathItem_Main(response)
     novel_item = ScrapynovelItem()

     if self.item_name.strip() == '':
         novel_item['name'] = self.getStrItem_Name(main_node)
     else:
         novel_item['name'] = self.item_name

     if self.item_author.strip() == '':
         novel_item['author'] = self.getStrItem_Author(main_node)
     else:
         novel_item['author'] = self.item_author

     novel_item['title'] = self.getStrItem_Title(main_node)

     chapter_no = re.findall(BooksSetting.getHeadHtmlReg(),
                             response.url)[0]
     novel_item['chapter'] = (
         chapter_no if len(chapter_no) > 1 else '0' + chapter_no)

     content_parts = self.getStrItem_Content(main_node)
     novel_item['content'] = self.list2str(content_parts)
     yield novel_item
Exemple #5
0
 def getStrItem_Name(self, xpath_main):
     """Return the novel name from ``BooksSetting``.

     ``xpath_main`` is accepted for signature compatibility but not used.
     """
     novel_name = BooksSetting.getNovelName()
     return novel_name
Exemple #6
0
class NovelSpiderBase(scrapy.spiders.Spider):
    """Base spider that crawls a novel index page and its chapter pages.

    The ``getXpath*`` / ``getStr*`` methods are extraction hooks. Their
    defaults return an empty string (``getStrItem_Name`` returns the name
    from ``BooksSetting``). Note that ``parse`` calls ``.extract()`` on the
    result of ``getXpathMainInfo``, so that hook must be overridden with
    something providing ``.extract()`` before ``parse`` can run.
    """

    start_urls = [BooksSetting.getHtml()]

    item_author = ""
    item_name = ""

    def __init__(self):
        # NOTE(review): this aliases the class-level ``start_urls`` list,
        # so every link appended in ``parse`` also lands in ``start_urls``.
        # Kept as-is because the crawl may rely on it -- confirm.
        self.urls = self.start_urls
        self.item_name = ""
        self.item_author = ""

    # ------------------------------------------------------------------
    # Extraction hooks (defaults)
    # ------------------------------------------------------------------
    def getXpathList(self, response):
        """Hook: chapter entries iterated by ``parse``; default is ``""``."""
        return ""

    def getXpathMainInfo(self, response):
        """Hook: main-info node passed to the name/author hooks."""
        return ""

    def getStrMainInfo_Name(self, info):
        """Hook: novel name taken from the main-info node; default ``""``."""
        return ""

    def getStrMainInfo_Author(self, info):
        """Hook: author taken from the main-info node; default ``""``."""
        return ""

    def getStrItem_Link(self, item):
        """Hook: link for one chapter entry; ``""`` means "no link"."""
        return ""

    def getStrItem_Idex(self, item):
        """Hook: index label of one chapter entry (only printed)."""
        return ""

    def getXpathItem_Main(self, response):
        """Hook: main node of a chapter page, fed to the item hooks."""
        return ""

    def getStrItem_Name(self, xpath_main):
        """Hook: novel name fallback; defaults to the configured name."""
        return BooksSetting.getNovelName()

    def getStrItem_Author(self, xpath_main):
        """Hook: author fallback used when none was found by ``parse``."""
        return ""

    def getStrItem_Title(self, xpath_main):
        """Hook: chapter title; default ``""``."""
        return ""

    def getStrItem_Content(self, xpath_main):
        """Hook: iterable of content strings, joined by ``list2str``."""
        return ""

    # ------------------------------------------------------------------
    # Scrapy callbacks
    # ------------------------------------------------------------------
    def parse(self, response):
        """Parse the index page and request the first not-yet-seen chapter.

        Stores the name and author extracted from the main-info node on the
        instance, then walks the chapter entries. Entries without a link
        and links already present in ``self.urls`` are skipped. For the
        first remaining link a GET ``Request`` with ``parse_item`` as
        callback is yielded and the link is recorded in ``self.urls``.
        """
        entries = self.getXpathList(response)
        info = self.getXpathMainInfo(response)
        print("info=", info.extract())
        self.item_name = self.getStrMainInfo_Name(info)
        self.item_author = self.getStrMainInfo_Author(info)
        base_url = BooksSetting.getHtml()
        for entry in entries:
            link = self.getStrItem_Link(entry)
            print("link=", link)
            # The empty check has to happen before the base URL is
            # prepended; afterwards the link can never be empty.
            if link == '':
                continue
            if not link.startswith(base_url):
                link = base_url + link
            if link in self.urls:
                continue
            index = self.getStrItem_Idex(entry)
            print("index=%s, link=%s" % (index, link))
            self.urls.append(link)
            yield Request(link, method="GET", callback=self.parse_item)
            # NOTE(review): this stops after the first new link, so each
            # call yields at most one request -- confirm this is intended.
            break

    def parse_item(self, response):
        """Turn one chapter response into a ``ScrapynovelItem``.

        Name and author fall back to the per-page hooks only when the
        values stored by ``parse`` are blank. The chapter value is the
        first match of ``BooksSetting.getHeadHtmlReg()`` in the response
        URL, prefixed with ``'0'`` when it is at most one character long.
        """
        self.print_response(response)
        xpath_main = self.getXpathItem_Main(response)
        item = ScrapynovelItem()
        if self.item_name.strip() == '':
            item['name'] = self.getStrItem_Name(xpath_main)
        else:
            item['name'] = self.item_name
        if self.item_author.strip() == '':
            item['author'] = self.getStrItem_Author(xpath_main)
        else:
            item['author'] = self.item_author
        item['title'] = self.getStrItem_Title(xpath_main)
        count = re.findall(BooksSetting.getHeadHtmlReg(), response.url)[0]
        if len(count) <= 1:
            count = '0' + count
        item['chapter'] = count
        item['content'] = self.list2str(self.getStrItem_Content(xpath_main))
        yield item

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    def print_response(self, response):
        """Print the requested URL and the raw body of ``response``."""
        current_url = response.url  # URL that was requested for this crawl
        body = response.body  # the returned HTML
        print("request=%s, response=%s" % (current_url, body))

    def list2str(self, list):
        """Concatenate the strings in ``list`` into one string.

        Ideographic spaces (``\\u3000``) and carriage returns are removed
        from every piece. ``str.replace`` returns a new string, so its
        result must be used; discarding it leaves the text unchanged.
        """
        return "".join(
            piece.replace('\u3000', '').replace('\r', '') for piece in list)
Exemple #7
0
 def __init__(self):
     """Set up the item buffer and the output file path.

     ``self.all`` collects every processed item. ``self.out_file`` is
     ``<current working directory>/out/<novel name>.txt``, with the novel
     name taken from ``BooksSetting.getNovelName()``.
     """
     self.all = []
     # os.path.join instead of hard-coded '\\' separators, so the path is
     # also valid on non-Windows systems.
     self.out_file = os.path.join(
         os.path.abspath('.'), 'out', BooksSetting.getNovelName() + ".txt")
Exemple #8
0
 def __init__(self):
     """Initialise the parent, then set this site's URL pieces."""
     super().__init__()
     # Literal site constants first, then the configured base URL.
     self.web_last = ".html"
     self.headLink = "http://www.lwxiaoshuo.com"
     base_url = BooksSetting.getHtml()
     self.web_head = base_url