Python validateFilename Examples, book_web.utils.spider_utils.validateFilename Python Examples

Example #1

0

Show file

    def parse_info(self, data):
        """Read book metadata from the page's Open Graph ``<meta>`` tags.

        Args:
            data: Markup accepted by ``pq``, or a response-like object whose
                ``content`` bytes are decoded with ``self.encoding`` first.

        Returns:
            dict: ``name`` and ``latest_chapter`` (both passed through
            ``validateFilename``), ``desc``, ``author_name``, ``markeup``
            (the ``og:novel:category`` value) and ``cover`` (always a list).
        """
        if data and hasattr(data, "content"):
            data = data.content.decode(self.encoding)

        doc = pq(data)

        def meta(prop):
            # ``content`` attribute of the <meta> tag carrying this property.
            return doc('meta[property="{}"]'.format(prop)).attr("content")

        book_name = meta("og:title")
        # A page without og:description yields no attribute value; fall back
        # to "" so the non-breaking-space cleanup cannot raise
        # AttributeError on None.
        book_desc = (meta("og:description") or "").replace("\xa0", "")
        latest_chapter_str = meta("og:novel:latest_chapter_name")
        author_name = meta("og:novel:author")
        markeup = meta("og:novel:category")
        cover = meta("og:image")
        # Callers always receive a list for ``cover``.
        if not isinstance(cover, list):
            cover = [cover]

        return {
            "name": validateFilename(book_name),
            "latest_chapter": validateFilename(latest_chapter_str),
            "desc": book_desc,
            "author_name": author_name,
            "markeup": markeup,
            "cover": cover,
        }

Example #2

0

Show file

File: BiqugexParser.py Project: FavoriteProjects/book_web

    def parse_info(self, data):
        """Build the book-info dict from the page's ``og:*`` meta tags.

        Args:
            data: Markup accepted by ``pq``, or a response-like object whose
                ``content`` bytes are decoded with ``self.encoding`` first.

        Returns:
            dict: keys ``name``, ``latest_chapter``, ``desc``,
            ``author_name``, ``markeup`` and ``cover``. ``name`` and
            ``latest_chapter`` are passed through ``validateFilename``;
            ``cover`` is always a list.
        """
        if data and hasattr(data, "content"):
            data = data.content.decode(self.encoding)

        doc = pq(data)
        book_name = doc('meta[property="og:title"]').attr('content')
        raw_desc = doc('meta[property="og:description"]').attr('content')
        # The description tag may be missing; only strip non-breaking spaces
        # when there is a value, instead of calling .replace on None.
        book_desc = raw_desc.replace("\xa0", '') if raw_desc else ''
        latest_chapter_str = doc(
            'meta[property="og:novel:latest_chapter_name"]').attr('content')
        author_name = doc('meta[property="og:novel:author"]').attr('content')
        markeup = doc('meta[property="og:novel:category"]').attr('content')
        cover = doc('meta[property="og:image"]').attr('content')
        # Normalise the cover to a list so callers get one shape.
        if not isinstance(cover, list):
            cover = [cover]

        info = {
            'name': validateFilename(book_name),
            'latest_chapter': validateFilename(latest_chapter_str),
            'desc': book_desc,
            'author_name': author_name,
            'markeup': markeup,
            'cover': cover
        }
        return info

Example #3

0

Show file

 def get_content_info(self, data):
     """Return ``(title, text)`` for a chapter page.

     ``data`` is either markup accepted by ``pq`` or a response-like object
     whose ``content`` bytes are decoded with ``self.encoding``. The title
     is the text of ``#main > h1`` passed through ``validateFilename``; the
     text is taken from ``#content``.
     """
     needs_decoding = data and hasattr(data, "content")
     if needs_decoding:
         data = data.content.decode(self.encoding)
     page = pq(data)
     heading = page("#main > h1").text()
     body = page("#content").text()
     return validateFilename(heading), body

Example #4

0

Show file

File: BookfereParser.py Project: duanz/book_web

 def get_content_info(self, data):
     """Return ``(title, text)`` for a chapter page.

     ``data`` is either markup accepted by ``pq`` or a response-like object
     whose ``content`` bytes are decoded with ``self.encoding``. The title
     is the ``.entry-title`` text passed through ``validateFilename``. The
     text concatenates every child of ``.entry-content`` after the first
     two, each followed by "。" and a CRLF.
     """
     if data and hasattr(data, "content"):
         data = data.content.decode(self.encoding)
     page = pq(data)
     heading = page(".entry-title").text()
     # The first two children of the content container are skipped.
     paragraphs = page(".entry-content").children()[2:]
     body = "".join(f"{node.text_content()}。\r\n" for node in paragraphs)
     return validateFilename(heading), body

Example #5

0

Show file

File: BookfereParser.py Project: duanz/book_web

    def parse_chapter(self, data):
        """Collect chapter links from every ``<article>`` under ``#main``.

        Args:
            data: Markup accepted by ``pq``, or a response-like object whose
                ``content`` bytes are decoded with ``self.encoding`` first.

        Returns:
            list[dict]: one single-entry dict per chapter, mapping the link
            text (passed through ``validateFilename``) to the link's
            ``href``, e.g. ``[{"Chapter 1": "http://www.a.cc/1"}, ...]``.
        """
        if data and hasattr(data, "content"):
            data = data.content.decode(self.encoding)

        doc = pq(data)
        chapter_list = []

        for article in doc("#main article"):
            # Chapter link lives at article > header > h1 > a.
            anchor = article.find("header/h1/a")
            if anchor is None:
                # An article without that link has no chapter to report;
                # skip it instead of failing on a missing element.
                continue
            link = anchor.get("href")
            chapter_list.append(
                {validateFilename(anchor.text_content()): link})
        return chapter_list

Example #6

0

Show file

    def parse_info(self, data):
        """Scrape book metadata from the book's info page.

        ``data`` is either markup accepted by ``pq`` or a response-like
        object whose ``content`` bytes are decoded with ``self.encoding``.

        Returns a dict with ``name`` and ``latest_chapter`` (both passed
        through ``validateFilename``), ``desc``, ``author_name``,
        ``markeup`` and ``cover``.
        """
        if data and hasattr(data, "content"):
            data = data.content.decode(self.encoding)

        page = pq(data)
        title = page('.btitle>h1').text()
        intro = page('p.intro').text()
        newest = page(
            '#container > div.bookinfo > p.stats > span.fl > a').text()
        writer = page('#container > div.bookinfo > div > em > a').text()
        category = page(
            '#wrapper > div.crumbs > div.fl > a:nth-child(3)').text()

        return {
            'name': validateFilename(title),
            'latest_chapter': validateFilename(newest),
            'desc': intro,
            'author_name': writer,
            'markeup': category,
            # No cover is scraped here; the result is a one-element list
            # holding an empty string.
            'cover': [""],
        }

Example #7

0

Show file

    def parse_info(self, data):
        """Scrape book metadata from the book's info page.

        ``data`` is either markup accepted by ``pq`` or a response-like
        object whose ``content`` bytes are decoded with ``self.encoding``.

        Returns a dict with ``name`` and ``latest_chapter`` (both passed
        through ``validateFilename``), ``desc``, ``author_name``,
        ``markeup`` and ``cover``.
        """
        if data and hasattr(data, "content"):
            data = data.content.decode(self.encoding)

        page = pq(data)
        title = page(".btitle>h1").text()
        summary = page("p.intro").text()
        latest = page(
            "#container > div.bookinfo > p.stats > span.fl > a").text()
        author = page("#container > div.bookinfo > div > em > a").text()
        category = page(
            "#wrapper > div.crumbs > div.fl > a:nth-child(3)").text()

        # This parser does not scrape a cover; it reports a list with a
        # single empty string.
        info = dict(
            name=validateFilename(title),
            latest_chapter=validateFilename(latest),
            desc=summary,
            author_name=author,
            markeup=category,
            cover=[""],
        )
        return info

Example #8

0

Show file

    def parse_chapter(self, data):
        """List the chapters found in the ``.chapterlist`` block.

        ``data`` is either markup accepted by ``pq`` or a response-like
        object whose ``content`` bytes are decoded with ``self.encoding``.

        Returns a list of single-entry dicts mapping the entry text (passed
        through ``validateFilename``) to ``self.page_base_url`` joined with
        the entry link's ``href``.
        """
        if data and hasattr(data, "content"):
            data = data.content.decode(self.encoding)

        page = pq(data)
        chapters = []
        # The first nine matched entries are skipped.
        for node in page('.chapterlist>dd')[9:]:
            if node.tag != 'dd':
                continue
            href = pq(pq(node)('a')).attr('href')
            chapters.append({
                validateFilename(node.text_content()):
                self.page_base_url + href
            })

        return chapters

Example #9

0

Show file

 def parse_all_book(self, data):
     """List the books found under ``#tlist > ul > li``.

     ``data`` is either markup accepted by ``pq`` or a response-like object
     whose ``content`` bytes are decoded with ``self.encoding``.

     Returns a list of dicts with ``title`` (passed through
     ``validateFilename``), ``url``, ``label`` (always empty) and
     ``author``. Items whose link has no ``href`` are left out.
     """
     if data and hasattr(data, "content"):
         data = data.content.decode(self.encoding)
     page = pq(data)
     books = []
     for item in page('#tlist > ul > li'):
         entry = pq(item)
         anchor = pq(entry('.zp>a'))
         name = anchor.text()
         writer = pq(entry('.author')).text()
         href = anchor.attr('href')
         record = {
             'title': validateFilename(name),
             'url': href,
             'label': "",
             "author": writer
         }
         # The record is built first, then dropped when there is no link.
         if href:
             books.append(record)
     return books

Example #10

0

Show file

File: TwQb5Parser.py Project: duanz/book_web

 def parse_all_book(self, data):
     """List the books found under ``#tlist > ul > li``.

     ``data`` is either markup accepted by ``pq`` or a response-like object
     whose ``content`` bytes are decoded with ``self.encoding``.

     Returns a list of dicts with ``title`` (passed through
     ``validateFilename``), ``url``, ``label`` (always empty) and
     ``author``. Items whose link has no ``href`` are left out.
     """
     if data and hasattr(data, "content"):
         data = data.content.decode(self.encoding)

     def describe(item):
         # Build the record for one <li>, whether or not it has a link.
         name = pq(pq(item)(".zp>a")).text()
         writer = pq(pq(item)(".author")).text()
         href = pq(pq(item)(".zp>a")).attr("href")
         return {
             "title": validateFilename(name),
             "url": href,
             "label": "",
             "author": writer,
         }

     items = pq(data)("#tlist > ul > li")
     # Keep only the records that actually carry a link.
     return [book for book in map(describe, items) if book["url"]]

Example #11

0

Show file

    def parse_chapter(self, data):
        """List the chapters that follow the first ``<dt>`` in ``#list dl``.

        ``data`` is either markup accepted by ``pq`` or a response-like
        object whose ``content`` bytes are decoded with ``self.encoding``.

        Returns a list of single-entry dicts such as
        ``[{"Chapter 1": "http://www.a.cc/1"}, ...]``. Each key is the
        element text passed through ``validateFilename``; each value is
        ``self.page_base_url`` joined with the element link's ``href``.
        """
        if data and hasattr(data, "content"):
            data = data.content.decode(self.encoding)

        page = pq(data)
        # The very first child of the <dl> is never considered.
        nodes = iter(page("#list dl").children()[1:])

        # Phase 1: discard everything up to and including the first <dt>.
        for node in nodes:
            if node.tag == "dt":
                break

        # Phase 2: every remaining element is treated as a chapter entry.
        chapters = []
        for node in nodes:
            href = node.find("a").get("href")
            chapters.append({
                validateFilename(node.text_content()):
                self.page_base_url + href
            })

        return chapters

Example #12

0

Show file

 def parse_all_book(self, data):
     """List the books found in the ``details list-type`` listing.

     ``data`` is either markup accepted by ``pq`` or a response-like object
     whose ``content`` bytes are decoded with ``self.encoding``.

     Returns a list of dicts with ``title`` (passed through
     ``validateFilename``), ``author``, ``url`` (prefixed with
     ``self.page_base_url``) and ``label`` (the ``.s1`` text without square
     brackets). Rows whose link has no ``href`` are left out.
     """
     if data and hasattr(data, "content"):
         data = data.content.decode(self.encoding)
     page = pq(data)
     books = []
     for row in page('#content > div > div.details.list-type > ul > li'):
         cell = pq(row)
         anchor = pq(cell('.s2 > a'))
         name = anchor.text()
         writer = pq(cell('.s3')).text()
         href = anchor.attr('href')
         raw_label = pq(cell('.s1')).text()
         category = raw_label.replace('[', '').replace(']', '')
         if href:
             books.append({
                 'title': validateFilename(name),
                 'author': writer,
                 'url': self.page_base_url + href,
                 'label': category
             })
     return books

Example #13

0

Show file

 def parse_all_book(self, data):
     """List the books found in the ``details list-type`` listing.

     ``data`` is either markup accepted by ``pq`` or a response-like object
     whose ``content`` bytes are decoded with ``self.encoding``.

     Returns a list of dicts with ``title`` (passed through
     ``validateFilename``), ``author``, ``url`` (prefixed with
     ``self.page_base_url``) and ``label`` (the ``.s1`` text without square
     brackets). Rows whose link has no ``href`` are left out.
     """
     if data and hasattr(data, "content"):
         data = data.content.decode(self.encoding)
     rows = pq(data)("#content > div > div.details.list-type > ul > li")

     def entries():
         # Yield one record per row that carries a link.
         for row in rows:
             name = pq(pq(row)(".s2 > a")).text()
             writer = pq(pq(row)(".s3")).text()
             href = pq(pq(row)(".s2 > a")).attr("href")
             category = pq(pq(row)(".s1")).text()
             category = category.replace("[", "").replace("]", "")
             if not href:
                 continue
             yield {
                 "title": validateFilename(name),
                 "author": writer,
                 "url": self.page_base_url + href,
                 "label": category,
             }

     return list(entries())

Example #14

0

Show file

 def parse_all_book(self, data):
     """List the books found in every ``.novellist`` block.

     ``data`` is either markup accepted by ``pq`` or a response-like object
     whose ``content`` bytes are decoded with ``self.encoding``.

     Returns a list of dicts with ``title`` (the ``<li>`` text passed
     through ``validateFilename``), ``url`` (the ``href`` of the item's
     link) and ``label`` (the block's ``<h2>`` text). Items without an
     ``href`` are left out.
     """
     if data and hasattr(data, "content"):
         data = data.content.decode(self.encoding)
     page = pq(data)
     books = []
     for section in page(".novellist"):
         # Re-parse the block's inner HTML so the lookups below are scoped
         # to this block only.
         section_doc = pq(pq(section).html())
         heading = section_doc("h2").text()
         for item in pq(section_doc("ul"))("li"):
             name = pq(item).text()
             href = pq(pq(item)("a")).attr("href")
             record = {
                 "title": validateFilename(name),
                 "url": href,
                 "label": heading
             }
             if href:
                 books.append(record)
     return books

Example #15

0

Show file

File: BiqugexParser.py Project: FavoriteProjects/book_web

 def parse_all_book(self, data):
     """List the books found in every ``.novellist`` block.

     ``data`` is either markup accepted by ``pq`` or a response-like object
     whose ``content`` bytes are decoded with ``self.encoding``.

     Returns a list of dicts with ``title`` (the ``<li>`` text passed
     through ``validateFilename``), ``url`` (the ``href`` of the item's
     link) and ``label`` (the block's ``<h2>`` text). Items without an
     ``href`` are left out.
     """
     if data and hasattr(data, "content"):
         data = data.content.decode(self.encoding)
     catalog = []
     for group in pq(data)('.novellist'):
         inner_html = pq(group).html()
         category = pq(inner_html)('h2').text()
         entries = pq(pq(inner_html)('ul'))('li')
         for entry in entries:
             name = pq(entry).text()
             href = pq(pq(entry)('a')).attr('href')
             candidate = {
                 'title': validateFilename(name),
                 'url': href,
                 'label': category
             }
             # Entries without a link are built but not kept.
             if href:
                 catalog.append(candidate)
     return catalog