Esempio n. 1
0
 def parse(self, response):
     """Scrape one novel detail page into a ``TNovelItem`` and yield it.

     Title, author, categories, signed flag and synopsis are read from
     the page with XPath. The cover image is fetched with ``requests``
     and written under ``../images//`` to a file named after the MD5 hex
     digest of ``plat_number + product_number``. A failed download is
     only printed; the item is yielded either way.

     Args:
         response: page response exposing ``url``, ``text`` and
             ``xpath()``.

     Yields:
         The populated ``TNovelItem``.
     """

     def joined(query, sep=''):
         # Glue every XPath match together with ``sep`` and trim the ends.
         return sep.join(response.xpath(query).extract()).strip()

     print('1,=========================',response.url)
     page_text = response.text  # read as before; not used further below
     novel = TNovelItem()

     page_url = response.url
     novel["src_url"] = page_url
     print('src_url:', page_url)

     # Raw title first, then the value returned by get_product_number().
     title = joined('//h1/em/text()')
     print('product_number:', title)
     title = get_product_number(title)
     print('product_number:', title)
     novel["product_number"] = title

     platform = 'P33'
     print('plat_number:', platform)
     novel["plat_number"] = platform

     writer = joined('//h1/a[@class="writer default"]/text()').replace('著','')
     novel["author"] = writer
     print('author:', writer)

     # Only the <i> labels after the third one are used as categories.
     categories = joined('//p[@class="tag-box"]/span/i[position()>3]/text()', ';')
     novel["novel_type"] = categories
     print('novel_type:', categories)

     novel["tags"] = None

     # Signed is 1 when any label text contains '签约', otherwise 0.
     label_text = joined('//p[@class="tag-box"]/span/i/text()')
     signed_flag = 1 if '签约' in label_text else 0
     novel["Signed"] = signed_flag
     print('Signed:', signed_flag)

     synopsis = joined('//div[@class="book-information cf"]/div[@class="book-info"]/p[@class="intro"]/text()')
     novel["novel_desc"] = synopsis
     print('novel_desc:', synopsis)

     # The image file name is the MD5 of platform code + product number.
     digest_source = platform + title
     image_key = hashlib.md5(digest_source.encode('UTF-8')).hexdigest()
     print('Product_image:', image_key)
     novel["Product_image"] = image_key

     cover_url = 'http:' + joined('//*[@id="bookImg"]/img/@src')
     print('P_image:', cover_url)

     image_dir = "../images//"
     image_path = image_dir + image_key
     try:
         if not os.path.exists(image_dir):
             os.mkdir(image_dir)
         if os.path.exists(image_path):
             print("文件已存在")
         else:
             reply = requests.get(cover_url)
             reply.raise_for_status()
             # The with-block closes the file handle automatically.
             with open(image_path, "wb") as out:  # "wb": binary write
                 out.write(reply.content)
             print("图片本地存储完成")
     except Exception as err:
         print("图片本地存储失败:" + str(err))

     stamp = datetime.datetime.now()
     modified_at = stamp.strftime('%Y-%m-%d %H:%M:%S')
     novel["last_modify_date"] = modified_at
     print('last_modify_date:', modified_at)
     yield novel
Esempio n. 2
0
 def parse(self, response):
     """Build a ``TNovelItem`` from a novel detail page and yield it.

     Title, author, category and synopsis are extracted with XPath;
     ``tags`` and ``Signed`` are stored as ``None``. The cover image is
     downloaded with ``requests`` into ``../images//`` under a file name
     equal to the MD5 hex digest of ``plat_number + product_number``.
     A failed download is printed and does not stop the item from being
     yielded.

     Args:
         response: page response exposing ``url`` and ``xpath()``.

     Yields:
         The populated ``TNovelItem``.
     """
     print('1,========================', response.url)
     item = TNovelItem()
     src_url = response.url
     item["src_url"] = src_url
     print('src_url:', src_url)
     # Raw title first, then the value returned by get_product_number().
     product_number = ''.join(
         response.xpath('//h2/text()').extract()).strip()
     print('product_number:', product_number)
     product_number = get_product_number(product_number)
     print('product_number:', product_number)
     item["product_number"] = product_number
     plat_number = 'P31'
     print('plat_number:', plat_number)
     item["plat_number"] = plat_number
     author = ''.join(
         response.xpath(
             '//div[@class="author-zone column-2"]/div[@class="right"]/a[@class="name"]//text()'
         ).extract()).strip()
     item["author"] = author
     print('author:', author)
     novel_type = ';'.join(
         response.xpath('//p[@class="infos"]/span[@class="cate"]/a/text()').
         extract()).strip()
     item["novel_type"] = novel_type
     print('novel_type:', novel_type)
     # Neither tags nor a signed flag are extracted from this page.
     tags = None
     item["tags"] = tags
     print('tags:', tags)
     Signed = None
     print('Signed:', Signed)
     item["Signed"] = Signed
     novel_desc = ''.join(
         response.xpath(
             '//div[@class="summary min-summary-height"]/pre[@class="note"]/text()'
         ).extract()).strip()
     item["novel_desc"] = novel_desc
     print('novel_desc:', novel_desc)
     # The image file name is the MD5 of platform code + product number.
     Product_image = plat_number + product_number
     Product_image = hashlib.md5(
         Product_image.encode(encoding='UTF-8')).hexdigest()
     print('Product_image:', Product_image)
     item["Product_image"] = Product_image
     # The expression must end at ``@src``: an attribute node has no
     # children, so any path step appended after it matches nothing.
     P_image = ''.join(
         response.xpath('//div[@class="pic"]/a/img/@src').extract()).strip()
     print('P_image:', P_image)
     root = "../images//"
     path = root + Product_image
     try:
         if not os.path.exists(root):
             os.mkdir(root)
         if not os.path.exists(path):
             # Bounded wait so a stalled image server cannot hang the
             # callback; a timeout lands in the except branch below.
             r = requests.get(P_image, timeout=10)
             r.raise_for_status()
             # The with-block closes the file handle automatically.
             with open(path, "wb") as f:  # "wb": binary write
                 f.write(r.content)
             print("图片本地存储完成")
         else:
             print("文件已存在")
     except Exception as e:
         # Best effort: a missing cover must not drop the item.
         print("图片本地存储失败:" + str(e))
     last_modify_date = datetime.datetime.now().strftime(
         '%Y-%m-%d %H:%M:%S')
     item["last_modify_date"] = last_modify_date
     print('last_modify_date:', last_modify_date)
     yield item
Esempio n. 3
0
 def parse(self, response):
     """Turn one novel detail page into a ``TNovelItem`` and yield it.

     Title, author, categories, tags, signed flag and synopsis come from
     XPath queries on the page. The cover is fetched with ``requests``
     and saved under ``../images//`` using the MD5 hex digest of
     ``plat_number + product_number`` as file name. Download errors are
     printed only; the item is yielded regardless.

     Args:
         response: page response exposing ``url``, ``text`` and
             ``xpath()``.

     Yields:
         The populated ``TNovelItem``.
     """

     def joined(query, sep=''):
         # Join all XPath matches with ``sep`` and strip the result.
         return sep.join(response.xpath(query).extract()).strip()

     print('1,=====================', response.url)
     page_text = response.text  # read as before; not used further below
     novel = TNovelItem()

     page_url = response.url
     novel["src_url"] = page_url
     print('src_url:', page_url)

     # The title attribute is passed through get_product_number().
     title = get_product_number(joined('//img[@class="qqredaer_tit"]/@title'))
     print('product_number:', title)
     novel["product_number"] = title

     platform = 'P17'
     print('plat_number:', platform)
     novel["plat_number"] = platform

     writer = joined('//*[@id="textauthor"]/following-sibling::p/a/text()')
     novel["author"] = writer
     print('author:', writer)

     # Links in the title bar except the first and the last one.
     categories = joined(
         '//div[@class="title"]/a[position()>1 and position()<last()]/text()',
         ';')
     novel["novel_type"] = categories
     print('novel_type:', categories)

     # Drop the "作品标签:" caption, then turn "、" separators into ";".
     # Replacing and stripping an empty string is a no-op, so no guard
     # is required.
     tag_text = joined('//div[@class="tags"]/text()').replace('作品标签:', '')
     tag_text = tag_text.replace('、', ';').strip()
     novel["tags"] = tag_text
     print('tags:', tag_text)

     badge = joined('//div[@class="tag"]/div[@class="y"]/a[@title]/text()')
     signed_flag = 1 if '签约作品' in badge else 0
     novel["Signed"] = signed_flag
     print('Signed:', signed_flag)

     synopsis = joined('//div[@class="info"]//text()')
     novel["novel_desc"] = synopsis
     print('novel_desc:', synopsis)

     # The image file name is the MD5 of platform code + product number.
     digest_source = platform + title
     image_key = hashlib.md5(digest_source.encode('UTF-8')).hexdigest()
     print('Product_image:', image_key)
     novel["Product_image"] = image_key

     cover_url = 'http:' + joined(
         '//div[@class="cover"]/a[@class="bookcover"]/img/@src')
     print('P_image:', cover_url)

     image_dir = "../images//"
     image_path = image_dir + image_key
     try:
         if not os.path.exists(image_dir):
             os.mkdir(image_dir)
         if os.path.exists(image_path):
             print("文件已存在")
         else:
             reply = requests.get(cover_url)
             reply.raise_for_status()
             # The with-block closes the file handle automatically.
             with open(image_path, "wb") as out:  # "wb": binary write
                 out.write(reply.content)
             print("图片本地存储完成")
     except Exception as err:
         print("图片本地存储失败:" + str(err))

     stamp = datetime.datetime.now()
     modified_at = stamp.strftime('%Y-%m-%d %H:%M:%S')
     novel["last_modify_date"] = modified_at
     print('last_modify_date:', modified_at)
     yield novel
Esempio n. 4
0
    def parse(self, response):
        """Build a ``TNovelItem`` from a novel detail page and yield it.

        Title, author, category, tags, signed flag and synopsis are
        extracted with XPath. Full-width brackets in the title are
        normalized to ``[`` / ``]`` before it is passed to
        ``get_product_number``. The cover image is downloaded with
        ``requests`` into ``../images//`` under a file name equal to the
        MD5 hex digest of ``plat_number + product_number``; a failed
        download is printed and does not stop the item from being yielded.

        Args:
            response: page response exposing ``url`` and ``xpath()``.

        Yields:
            The populated ``TNovelItem``.
        """
        print('1,=========================', response.url)
        item = TNovelItem()
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        product_number = ''.join(
            response.xpath(
                '//div[@class="main"]/div[@class="status fl"]/h1/a/text()').
            extract()).strip()
        # Normalize both bracket characters unconditionally: replace() is
        # a no-op when a character is absent, and the former test
        # ``'【' and '】' in s`` only ever looked for '】'.
        product_number = product_number.replace('【', '[').replace('】', ']')
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P21'
        item["plat_number"] = plat_number
        print('plat_number:', plat_number)

        author = ''.join(
            response.xpath(
                '//div[@class="main"]/div[@class="status fl"]/div[@class="booksub"]/a[@title]/text()'
            ).extract()).strip()
        print('author:', author)
        item["author"] = author
        novel_type = ''.join(
            response.xpath(
                '//div[@class="main"]/div[@class="status fl"]/div[@class="booksub"]/a[last()]/text()'
            ).extract()).strip()
        print('novel_type:', novel_type)
        item["novel_type"] = novel_type
        tags = response.xpath(
            '//div[@class="main"]/div[@class="status fl"]/div[@class="keyword"]/a[@title]/text()'
        ).extract()
        # NOTE(review): this one-second sleep blocks the calling thread.
        # If this runs as a Scrapy callback it stalls every in-flight
        # request; the DOWNLOAD_DELAY setting is the usual way to throttle.
        # Kept so the existing pacing does not change.
        time.sleep(1)
        tags = ';'.join(tags)
        print('tags:', tags)
        item["tags"] = tags
        Signed = ''.join(
            response.xpath(
                '//div[@class="main"]/div[@class="status fl"]/h1/em[@class="sign"]/@title'
            ).extract()).strip()
        # Signed is 1 when the badge title contains '签约作品', else 0.
        Signed = 1 if '签约作品' in Signed else 0
        item["Signed"] = Signed
        print('Signed:', Signed)
        novel_desc = response.xpath(
            '//div[@class="main"]/div[@class="status fl"]/div[@class="info_con"]/p/text()'
        ).extract()
        # Carriage returns inside the synopsis become two spaces.
        novel_desc = '  '.join(''.join(novel_desc).split('\r'))
        item["novel_desc"] = novel_desc
        print('novel_desc:', novel_desc)
        # The image file name is the MD5 of platform code + product number.
        Product_image = plat_number + product_number
        Product_image = hashlib.md5(
            Product_image.encode(encoding='UTF-8')).hexdigest()
        print('Product_image:', Product_image)
        item["Product_image"] = Product_image
        P_image = ''.join(
            response.xpath(
                '//div[@class="main"]/div[@class="book_cover fl"]/p/a/img[@title]/@src'
            ).extract()).strip()
        print('P_image:', P_image)
        root = "../images//"
        path = root + Product_image
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                # Bounded wait so a stalled image server cannot hang the
                # callback; a timeout lands in the except branch below.
                r = requests.get(P_image, timeout=10)
                r.raise_for_status()
                # The with-block closes the file handle automatically.
                with open(path, "wb") as f:  # "wb": binary write
                    f.write(r.content)
                print("图片本地存储完成")
            else:
                print("文件已存在")
        except Exception as e:
            # Best effort: a missing cover must not drop the item.
            print("图片本地存储失败:" + str(e))
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)

        yield item
Esempio n. 5
0
    def parse_page_p(self, response):
        """Fill the item carried in ``response.meta`` from a JSON payload.

        The response body is parsed as JSON and the fields under its
        ``data`` key (``book_name``, ``author_name``, ``class_name``,
        ``is_end_write``, ``description``, ``cover``) are copied onto the
        item. Full-width brackets in the book name are normalized to
        ``[`` / ``]`` before it is passed to ``get_product_number``. The
        cover is downloaded with ``requests`` into ``../images//`` under a
        file name equal to the MD5 hex digest of
        ``plat_number + product_number``; a failed download is printed.

        NOTE(review): the visible code neither yields nor returns the
        item after filling it -- confirm whether a ``yield item`` is
        missing.

        Args:
            response: response exposing ``url``, ``text`` and a ``meta``
                mapping that contains ``"item"``.
        """
        print('2,=======================', response.url)
        item = response.meta["item"]
        text = response.text
        jsons = json.loads(text)
        # Assumes the payload has a "data" object with a "book_name"
        # string -- TODO confirm; a missing key raises below, as before.
        data = jsons.get('data')
        product_number = data.get('book_name')
        # Normalize both bracket characters unconditionally: replace() is
        # a no-op when a character is absent, and the former test
        # ``'【' and '】' in s`` only ever looked for '】'.
        product_number = product_number.replace('【', '[').replace('】', ']')
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P19'
        item["plat_number"] = plat_number
        print('plat_number:', plat_number)

        author = data.get('author_name')
        print('author:', author)
        item["author"] = author
        novel_type = data.get('class_name')
        print('novel_type:', novel_type)
        item["novel_type"] = novel_type
        tags = None
        print('tags:', tags)
        item["tags"] = tags
        # NOTE(review): "is_end_write" is stored as the Signed flag --
        # confirm this is the intended source field.
        Signed = data.get('is_end_write')
        item["Signed"] = Signed
        print('Signed:', Signed)
        novel_desc = data.get('description')
        print('novel_desc:', novel_desc)
        item["novel_desc"] = novel_desc
        # The image file name is the MD5 of platform code + product number.
        Product_image = plat_number + product_number
        Product_image = hashlib.md5(
            Product_image.encode(encoding='UTF-8')).hexdigest()
        print('Product_image:', Product_image)
        item["Product_image"] = Product_image
        P_image = data.get('cover')
        print('P_image:', P_image)
        root = "../images//"
        path = root + Product_image
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                # Bounded wait so a stalled image server cannot hang the
                # callback; a timeout lands in the except branch below.
                r = requests.get(P_image, timeout=10)
                r.raise_for_status()
                # The with-block closes the file handle automatically.
                with open(path, "wb") as f:  # "wb": binary write
                    f.write(r.content)
                print("图片本地存储完成")
            else:
                print("文件已存在")
        except Exception as e:
            # Best effort: a missing cover must not abort the callback.
            print("图片本地存储失败:" + str(e))
        last_modify_date = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
Esempio n. 6
0
    def parse(self, response):
        """Turn one novel detail page into a ``TNovelItem`` and yield it.

        Title, author, category, tags, signed flag and synopsis come from
        XPath queries on the page. The cover is fetched with ``requests``
        and saved under ``../images//`` using the MD5 hex digest of
        ``plat_number + product_number`` as file name. Download errors
        are printed only; the item is yielded regardless.

        Args:
            response: page response exposing ``url`` and ``xpath()``.

        Yields:
            The populated ``TNovelItem``.
        """

        def joined(query, sep=''):
            # Join all XPath matches with ``sep`` and strip the result.
            return sep.join(response.xpath(query).extract()).strip()

        print('1,================',response.url)
        novel = TNovelItem()

        page_url = response.url
        novel["src_url"] = page_url
        print('src_url:', page_url)

        # Raw title first, then the value returned by get_product_number().
        title = joined('//div[@class="Info Sign"]/h1/a[@target="_blank"]/text()')
        print('product_number:', title)
        title = get_product_number(title)
        print('product_number:', title)
        novel["product_number"] = title

        platform = 'P22'
        print('plat_number:', platform)
        novel["plat_number"] = platform

        writer = joined('//div[@class="author"]/a[@class="name"]/text()')
        novel["author"] = writer
        print('author:', writer)

        category = joined('//dl[@class="Tab"]/dd/div[2]/table/tr[1]/td/a/text()')
        novel["novel_type"] = category
        print('novel_type:', category)

        # Tags are joined with ";"; any "、" inside a tag is turned into
        # ";" as well (a no-op when the character is absent).
        tag_text = joined(
            '//dl[@class="Tab"]/dd/div[2]/table/tr[last()]/td/a/span/text()',
            ';').replace('、', ';')
        novel["tags"] = tag_text
        print('tags:', tag_text)

        badge = joined('//dl[@class="Tab"]/dd/div[2]/table/tr[1]/td/span/text()')
        signed_flag = 1 if '签约作品' in badge else 0
        novel["Signed"] = signed_flag
        print('Signed:', signed_flag)

        synopsis = joined('//dl[@class="Tab"]/dd/div[1]/a//text()')
        novel["novel_desc"] = synopsis
        print('novel_desc:', synopsis)

        # The image file name is the MD5 of platform code + product number.
        digest_source = platform + title
        image_key = hashlib.md5(digest_source.encode('UTF-8')).hexdigest()
        print('Product_image:', image_key)
        novel["Product_image"] = image_key

        cover_url = joined('//div[@id="bookCover"]/a/img/@src')
        print('P_image:', cover_url)

        image_dir = "../images//"
        image_path = image_dir + image_key
        try:
            if not os.path.exists(image_dir):
                os.mkdir(image_dir)
            if os.path.exists(image_path):
                print("文件已存在")
            else:
                reply = requests.get(cover_url)
                reply.raise_for_status()
                # The with-block closes the file handle automatically.
                with open(image_path, "wb") as out:  # "wb": binary write
                    out.write(reply.content)
                print("图片本地存储完成")
        except Exception as err:
            print("图片本地存储失败:" + str(err))

        stamp = datetime.datetime.now()
        modified_at = stamp.strftime('%Y-%m-%d %H:%M:%S')
        novel["last_modify_date"] = modified_at
        print('last_modify_date:', modified_at)

        yield novel
Esempio n. 7
0
    def parse(self, response):
        """Build a ``TNovelItem`` from a novel detail page and yield it.

        Title, author, genre, tags, signed flag and synopsis are extracted
        with XPath. Full-width brackets in the title are normalized to
        ``[`` / ``]`` before it is passed to ``get_product_number``, and
        ``-`` separators in the genre become ``;``. The cover image is
        downloaded with ``requests`` into ``../images//`` under a file
        name equal to the MD5 hex digest of
        ``plat_number + product_number``; a failed download is printed
        and does not stop the item from being yielded.

        Args:
            response: page response exposing ``url`` and ``xpath()``.

        Yields:
            The populated ``TNovelItem``.
        """
        print('1,================',response.url)
        item = TNovelItem()
        product_number = ''.join(response.xpath('//h1[@itemprop="name"]/span/text()').extract()).strip()
        # Normalize both bracket characters unconditionally: replace() is
        # a no-op when a character is absent, and the former test
        # ``'【' and '】' in s`` only ever looked for '】'.
        product_number = product_number.replace('【', '[').replace('】', ']')
        print('product_number:', product_number)
        product_number = get_product_number(product_number)
        print('product_number:', product_number)
        item["product_number"] = product_number
        plat_number = 'P16'
        item["plat_number"] = plat_number
        print('plat_number:', plat_number)

        author = ''.join(response.xpath('//*[@itemprop="author"]/text()').extract()).strip()
        print('author:', author)
        item["author"] = author
        novel_type = ''.join(response.xpath('//*[@itemprop="genre"]/text()').extract()).strip()
        novel_type = novel_type.replace('-', ';')
        print('novel_type:', novel_type)
        item["novel_type"] = novel_type
        tags = response.xpath('//*[@class="smallreadbody"]/span/a/text() | //div[@class="smallreadbody"]/span[@style="color: red;"]//text()').extract()
        # NOTE(review): this one-second sleep blocks the calling thread.
        # If this runs as a Scrapy callback it stalls every in-flight
        # request; the DOWNLOAD_DELAY setting is the usual way to throttle.
        # Kept so the existing pacing does not change.
        time.sleep(1)
        tags = ';'.join(tags)
        print('tags:', tags)
        item["tags"] = tags
        Signed = ''.join(response.xpath('//div[@class="righttd"]/ul[@class="rightul"]/li[last()-1]/b//text()').extract()).strip()
        # Signed is 1 when the status text contains '已签约', else 0.
        Signed = 1 if '已签约' in Signed else 0
        item["Signed"] = Signed
        print('Signed:', Signed)
        novel_desc = ''.join(response.xpath('//div[@id="novelintro"]//text()').extract()).strip()
        print('novel_desc:', novel_desc)
        item["novel_desc"] = novel_desc
        # The image file name is the MD5 of platform code + product number.
        Product_image = plat_number + product_number
        Product_image = hashlib.md5(Product_image.encode(encoding='UTF-8')).hexdigest()
        print('Product_image:', Product_image)
        item["Product_image"] = Product_image
        P_image = ''.join(response.xpath('//img[@itemprop="image"]/@src').extract()).strip()
        print('P_image:', P_image)
        root = "../images//"
        path = root + Product_image
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                # Bounded wait so a stalled image server cannot hang the
                # callback; a timeout lands in the except branch below.
                r = requests.get(P_image, timeout=10)
                r.raise_for_status()
                # The with-block closes the file handle automatically.
                with open(path, "wb") as f:  # "wb": binary write
                    f.write(r.content)
                print("图片本地存储完成")
            else:
                print("文件已存在")
        except Exception as e:
            # Best effort: a missing cover must not drop the item.
            print("图片本地存储失败:" + str(e))
        last_modify_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item["last_modify_date"] = last_modify_date
        print('last_modify_date:', last_modify_date)
        src_url = response.url
        item["src_url"] = src_url
        print('src_url:', src_url)
        yield item
Esempio n. 8
0
 def parse(self, response):
     """Turn one novel detail page into a ``TNovelItem`` and yield it.

     Title, author, categories and synopsis come from XPath queries on
     the page; ``tags`` and ``Signed`` are stored as ``None``. The cover
     is fetched with ``requests`` and saved under ``../images//`` using
     the MD5 hex digest of ``plat_number + product_number`` as file
     name. Download errors are printed only; the item is yielded
     regardless.

     Args:
         response: page response exposing ``url``, ``text`` and
             ``xpath()``.

     Yields:
         The populated ``TNovelItem``.
     """

     def joined(query, sep=''):
         # Join all XPath matches with ``sep`` and strip the result.
         return sep.join(response.xpath(query).extract()).strip()

     print('1,=====================', response.url)
     page_text = response.text  # read as before; not used further below
     novel = TNovelItem()

     page_url = response.url
     novel["src_url"] = page_url
     print('src_url:', page_url)

     # Raw title first, then the value returned by get_product_number().
     title = joined(
         '//*[@id="bookinfo"]/div[@class="book_info"]/h3/a[@name="readurl"]/text()')
     print('product_number:', title)
     title = get_product_number(title)
     print('product_number:', title)
     novel["product_number"] = title

     platform = 'P17'
     print('plat_number:', platform)
     novel["plat_number"] = platform

     writer = joined(
         '//*[@id="bookinfo"]/div[@class="book_info"]/dl/dd[@class="w_au"]/a/text()')
     novel["author"] = writer
     print('author:', writer)

     raw_categories = joined(
         '//*[@id="nav"]/a[position()>1]/text() | //dd[@class="w_auth"]/a/text()',
         ';')
     print('novel_type_s:', raw_categories)
     # "/" separators become ";" (a no-op when no "/" is present).
     categories = raw_categories.replace('/', ';')
     novel["novel_type"] = categories
     print('novel_type:', categories)

     # Neither tags nor a signed flag are extracted from this page.
     novel["tags"] = None
     print('tags:', None)
     novel["Signed"] = None

     synopsis = joined(
         '//*[@id="bookIntro"]/p/text() | //*[@id="bookIntro"]/text()')
     novel["novel_desc"] = synopsis
     print('novel_desc:', synopsis)

     # The image file name is the MD5 of platform code + product number.
     digest_source = platform + title
     image_key = hashlib.md5(digest_source.encode('UTF-8')).hexdigest()
     print('Product_image:', image_key)
     novel["Product_image"] = image_key

     cover_url = 'http:' + joined(
         '//*[@id="bookinfo"]/div[@class="bookBox"]/a/img/@src')
     print('P_image:', cover_url)

     image_dir = "../images//"
     image_path = image_dir + image_key
     try:
         if not os.path.exists(image_dir):
             os.mkdir(image_dir)
         if os.path.exists(image_path):
             print("文件已存在")
         else:
             reply = requests.get(cover_url)
             reply.raise_for_status()
             # The with-block closes the file handle automatically.
             with open(image_path, "wb") as out:  # "wb": binary write
                 out.write(reply.content)
             print("图片本地存储完成")
     except Exception as err:
         print("图片本地存储失败:" + str(err))

     stamp = datetime.datetime.now()
     modified_at = stamp.strftime('%Y-%m-%d %H:%M:%S')
     novel["last_modify_date"] = modified_at
     print('last_modify_date:', modified_at)
     yield novel