Example #1
    def crawl(self, item, thread):
        match = re.search(r'nhentai\.net/g/\d+', self.url)
        if not match:
            logger.info("URL does not match")
            return None
        if 'https' not in self.url:
            self.url = 'https://' + self.url

        session = requests.Session()
        session.headers.update({'User-Agent': ua.get_random_ua()})
        session.proxies.update(config.PROXY)

        try:
            logger.info("fetching " + self.url)
            r = session.get(self.url)
            item.cookies = r.cookies

            selector = etree.HTML(r.text)

            # gallery titles: secondary title (<h2>) first, English title (<h1>) second
            en_title = selector.xpath('//*[@id="info"]/h1/text()')
            sub_title = selector.xpath('//*[@id="info"]/h2/text()')
            item.titles = sub_title + en_title
            item.author = selector.xpath('//*[@id="tags"]/div[4]/span[1]/a/text()')

            item.tags = selector.xpath('//*[@id="tags"]/div[3]/span/a/text()')
            item.language = selector.xpath('//*[@id="tags"]/div[6]/span/a/text()')
            # thumbnail data-src URLs are mapped to full-size image URLs
            item.image_urls = selector.xpath('//*[@id="thumbnail-container"]/div/a/img/@data-src')
            item.image_urls = list(map(convert_url, item.image_urls))
            item.source = self.url
            thread.progress = 0.05
            return item
        except requests.exceptions.ConnectionError as e:
            print(e)
            return None
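
The convert_url helper is not shown in this example. Judging from how it is used, it turns each thumbnail data-src URL into a full-size image URL. A minimal sketch under that assumption; the t./i. subdomains and the trailing "t" suffix in thumbnail filenames are guesses about the site's naming scheme, not taken from the example itself:

import re

def convert_url(thumb_url):
    # Hypothetical helper: turn a thumbnail URL into a full-size image URL.
    # Assumes thumbnails look like //t.nhentai.net/galleries/<id>/<n>t.jpg and
    # full images live at //i.nhentai.net/galleries/<id>/<n>.jpg.
    url = 'https:' + thumb_url if thumb_url.startswith('//') else thumb_url
    url = url.replace('//t.', '//i.', 1)        # thumbnail host -> image host
    return re.sub(r't(\.\w+)$', r'\1', url)     # drop the trailing 't' before the extension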
Example #2
    def crawl(self, item, thread):
        match = re.search(r'nhentai\.net/g/\d+', self.url)
        if not match:
            logger.info("URL does not match")
            return None
        if 'https' not in self.url:
            self.url = 'https://' + self.url

        session = requests.Session()
        session.headers.update({'User-Agent': ua.get_random_ua()})
        session.proxies.update(config.PROXY)

        try:
            logger.info("fetching " + self.url)
            r = session.get(self.url)
            item.cookies = r.cookies

            selector = etree.HTML(r.text)

            en_title = selector.xpath('//*[@id="info"]/h1/text()')
            jp_title = selector.xpath('//*[@id="info"]/h2/text()')
            item.titles = jp_title + en_title
            item.author = selector.xpath('//*[@id="tags"]/div[4]/span[1]/a/text()')

            item.tags = selector.xpath('//*[@id="tags"]/div[3]/span/a/text()')
            item.language = selector.xpath('//*[@id="tags"]/div[6]/span/a/text()')
            item.image_urls = selector.xpath('//*[@id="thumbnail-container"]/div/a/img/@data-src')
            item.image_urls = list(map(convert_url, item.image_urls))
            item.source = self.url
            thread.progress = 0.05
            return item
        except requests.exceptions.ConnectionError as e:
            print(e)
            return None
Example #3
    def crawl(self, item, thread):
        match = re.search(r'www\.wnacg\.com/photos-index-aid-\d+\.html', self.url)
        if not match:
            print('URL does not match')
            return None

        session = requests.Session()
        session.headers.update({'User-Agent': ua.get_random_ua()})
        session.proxies.update(config.PROXY)
        try:
            r = session.get(self.url)
            soup = BeautifulSoup(r.text, "html.parser")

            item.titles = [soup.select('.userwrap h2')[0].string]

            item.image_urls = []
            item.image_urls += get_image_url(item, soup)

            # last page number comes from the second-to-last paginator link
            page = int(soup.select('.f_left.paginator a')[-2].string)

            total_images_count = page * len(soup.select('.li.gallary_item'))
            for i in range(2, page + 1):
                index_url = "http://www.wnacg.org/photos-index-page-%d-aid-%s.html" % (i, item.id)
                r = session.get(index_url)
                soup = BeautifulSoup(r.text, "html.parser")
                item.image_urls += get_image_url(item, soup)
                thread.progress = 0.10 * (len(item.image_urls) / total_images_count)
            return item
        except requests.exceptions.ConnectionError as e:
            print(e)
            return None
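
The get_image_url(item, soup) helper is referenced but not defined in these snippets. From its use it apparently collects the image URLs found on one gallery index page. A rough sketch under that assumption; the img attribute names and the protocol fix-up are guesses rather than the project's actual implementation:

def get_image_url(item, soup):
    # Hypothetical helper: collect image URLs from one page of .gallary_item entries.
    urls = []
    for img in soup.select('.li.gallary_item img'):
        src = img.get('data-src') or img.get('src')   # lazy-loaded or plain src
        if not src:
            continue
        if src.startswith('//'):                      # protocol-relative URL
            src = 'https:' + src
        urls.append(src)
    return urls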
Example #4
    def crawl(self, item, thread):
        match = re.search(r'www\.wnacg\.com/photos-index-aid-\d+\.html', self.url)
        if not match:
            print('URL does not match')
            return None

        session = requests.Session()
        session.headers.update({'User-Agent': ua.get_random_ua()})
        session.proxies.update(config.PROXY)
        try:
            r = session.get(self.url)
            soup = BeautifulSoup(r.text, "html.parser")

            item.titles = [soup.select('.userwrap h2')[0].string]

            item.image_urls = []
            item.image_urls += get_image_url(item, soup)

            # last page number comes from the second-to-last paginator link
            page = int(soup.select('.f_left.paginator a')[-2].string)

            total_images_count = page * len(
                soup.select('.li.gallary_item'))
            for i in range(2, page + 1):
                index_url = "http://www.wnacg.org/photos-index-page-%d-aid-%s.html" % (
                    i, item.id)
                r = session.get(index_url)
                soup = BeautifulSoup(r.text, "html.parser")
                item.image_urls += get_image_url(item, soup)
                thread.progress = 0.10 * (len(item.image_urls) /
                                          total_images_count)
            return item
        except requests.exceptions.ConnectionError as e:
            print(e)
            return None
Example #5
    def generate(self, dir, thread, callback=None):
        self.epub = ComicEpub(dir)

        print('start downloading image resources:')
        count = len(self.item.image_urls)
        thread.progress = 1 / (count + 1)

        session = requests.Session()
        session.headers.update({'User-Agent': ua.get_random_ua()})
        session.proxies.update(config.PROXY)

        for (index, url) in enumerate(self.item.image_urls):
            print('[%d/%d] %s ' % (index + 1, count, url), end='')
            sys.stdout.flush()

            r = session.get(url)
            if r.ok:
                thread.progress = (index + 1 + 1) / (count + 1)
                print('[OK]')
                image_name = url.split('/')[-1]
                is_cover = (index == 0)

                _, ext = os.path.splitext(image_name)
                self.epub.add_comic_page(r.content, ext, is_cover)
            else:
                print('[FAIL]')
                return False
        print('download completed.')
        self.epub.title = (self.item.titles[0], self.item.titles[0])
        self.epub.subjects = list(self.item.tags)
        self.epub.authors = [(self.item.author, self.item.author)]
        self.epub.publisher = ('Comicbook', 'Comicbook')

        if len(self.item.language) > 0:
            # prefer an explicit language tag, ignoring the generic 'translated' tag
            for language in self.item.language:
                if language == 'translated':
                    continue
                self.epub.language = get_language_code(language)
        else:
            # no language tag: guess Chinese from common translation markers in the title
            if len(self.item.titles) > 0 and (
                    '漢化' in self.item.titles[0] or
                    '汉化' in self.item.titles[0] or
                    '翻譯' in self.item.titles[0]
            ):
                self.epub.language = 'zh'

        print('epubify...')
        self.epub.save()
        print('work done.')

        if callback:
            callback(self.item)
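
get_language_code is another helper that is not part of the snippet; it presumably maps a gallery language tag to an ISO 639-1 code for the EPUB metadata. A minimal sketch with an assumed tag-to-code table:

def get_language_code(language):
    # Hypothetical helper: map a language tag to an ISO 639-1 code.
    codes = {
        'japanese': 'ja',
        'chinese': 'zh',
        'english': 'en',
        'korean': 'ko',
    }
    return codes.get(language.lower(), 'en')  # fall back to English for unknown tags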
Example #6
    def crawl(self, item, thread):
        match = re.search(r'wnacg\.org', self.url)
        if not match:
            logger.info(" url not match")
            return None
        if 'http' not in self.url:
            self.url = 'https://' + self.url

        session = requests.Session()
        session.headers.update({'User-Agent': ua.get_random_ua()})
        session.proxies.update(config.PROXY)
        try:
            r = session.get(self.url)
            selector = etree.HTML(r.text)

            title = selector.xpath('//*[@id="bodywrap"]/h2/text()')[0]

            pages = []
            img_urls = []
            page = selector.xpath(
                '//*[@id="bodywrap"]/div[2]/div/ul/li[1]/div[1]/a')[0].get(
                    'href')
            pages.append(get_full_url(page))
            # follow the "next page" link until it wraps back around to the first page
            while len(pages) == 1 or pages[0] != pages[-1]:
                current_page = pages[-1]
                p = session.get(current_page)
                sel = etree.HTML(p.text)
                img_url = sel.xpath('//*[@id="picarea"]')[0].get('src')
                img_urls.append(img_url)

                next_page = sel.xpath('/html/body/div[8]/div/div/a[2]')[0].get(
                    'href')
                pages.append(get_full_url(next_page))

            item.titles = [title]
            item.author = 'Unknown Author'
            item.tags = []
            item.image_urls = list(map(lambda url: 'https:' + url, img_urls))
            thread.progress = 0.05
            return item
        except requests.exceptions.ConnectionError as e:
            print(e)
            return None
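
get_full_url is also undefined here; it apparently resolves the relative hrefs taken from the page links into absolute URLs. A sketch assuming the wnacg.org host seen elsewhere in these examples:

from urllib.parse import urljoin

def get_full_url(href):
    # Hypothetical helper: resolve a (possibly relative) href against the site root.
    return urljoin('https://www.wnacg.org/', href)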
Example #7
    def crawl(self, item, thread):
        match = re.search(r'e-hentai\.org/g/\d+/\w+', self.url)
        if not match:
            print('URL does not match')
            return None
        if 'https' not in self.url:
            self.url = 'https://' + self.url
        # always define `url`, with any trailing slash removed
        url = self.url.rstrip('/')
        data = match.group().split('/')
        gid = data[-2]
        token = data[-1]

        session = requests.Session()
        session.headers.update({'User-Agent': ua.get_random_ua(),
                                'Referer': 'https://e-hentai.org/',
                                'Host': 'e-hentai.org',
                                'authority': 'e-hentai.org'})
        session.proxies.update(config.PROXY)

        try:
            r = session.get(url)
            soup = BeautifulSoup(r.text, "html.parser")
            en_title = soup.select('#gn')[0].string
            jp_title = soup.select('#gj')[0].string
            item.titles = []
            if jp_title != "":
                item.titles.append(jp_title)
            item.titles.append(en_title)

            tags_container = soup.select('#taglist table tbody tr')
            for container in tags_container:
                if container.select('td')[0].string == 'artist:':
                    item.author = container.select('td')[1].select('div a')[0].string
                elif container.select('td')[0].string == 'character:':
                    pass

            nav_container = soup.select('div.gtb table.ptt tr td')
            page_num = len(nav_container) - 2

            item.image_urls = []

            for page_index in range(page_num):
                page_r = session.get(url, params={'p': page_index})
                page_soup = BeautifulSoup(page_r.text, 'html.parser')

                thumb_images_container = page_soup.select('#gdt div[class="gdtm"]')
                # approximate the total across all pages so the progress fraction stays in range
                total_images_count = page_num * len(thumb_images_container)

                for container in thumb_images_container:
                    images_page_url = container.select('div a')[0].get('href')
                    r_images_page = session.get(images_page_url)

                    soup_images_page = BeautifulSoup(r_images_page.text, "html.parser")
                    imgs = soup_images_page.select('.sni a img')

                    for img in imgs:
                        if re.search(r'/h/', img['src']):
                            item.image_urls.append(img['src'])
                            thread.progress = 0.15 * (len(item.image_urls) / total_images_count)
                            print(img['src'])

            return item
        except requests.exceptions.ConnectionError as e:
            print(e)
            return None
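
Every example builds its requests session from a ua module and a config.PROXY mapping, neither of which is shown. A minimal sketch of what they might contain; the user-agent strings and the commented-out proxy address are placeholders, not values from the project:

import random

# ua module (hypothetical): pick a user-agent string at random for each session
_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
]

def get_random_ua():
    return random.choice(_USER_AGENTS)

# config module (hypothetical): a requests-style proxies dict; empty means no proxy
PROXY = {
    # 'http': 'socks5h://127.0.0.1:1080',
    # 'https': 'socks5h://127.0.0.1:1080',
}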