예제 #1
0
    def _process_chapter(self,
                         ch_num,
                         ch_link,
                         series_dir,
                         series_name,
                         pages=None):
        """Download every page image of a chapter into its own directory.

        Args:
            ch_num: Chapter identifier; used for the directory name,
                progress bookkeeping, and log output.
            ch_link: URL of the chapter page.
            series_dir: Root directory of the series on disk.
            series_name: Key used to track progress in ``self._PROGRESS``.
            pages: Optional collection of 1-based page numbers; when given,
                only those pages are downloaded.

        Returns:
            Total number of pages found in the chapter.
        """
        print('Downloading chapter #{} from {}'.format(ch_num, ch_link))

        ch_dir = '{}/{}'.format(series_dir, ch_num)
        if not os.path.exists(ch_dir):
            os.makedirs(ch_dir)

        page_source = get_page_source(ch_link)
        page_list = self._get_page_list(page_source)

        # Initialise progress bookkeeping for this series/chapter pair.
        if series_name not in self._PROGRESS:
            self._PROGRESS[series_name] = {}
        if ch_num not in self._PROGRESS[series_name]:
            self._PROGRESS[series_name][ch_num] = {
                'progress': 0,
                'max': len(page_list)
            }

        for index, page in enumerate(page_list):
            page_num = index + 1

            # Single-page download mode: skip pages not requested.
            if pages and page_num not in pages:
                continue

            if index != 0:
                # Throttle between page requests.
                time.sleep(random.uniform(1.5, 3.0))

                # First page reuses the chapter source fetched above; later
                # pages need their own fetch in multi-page mode.
                if not self._ONE_PAGE:
                    page_source = get_page_source(page)

            if self._ONE_PAGE:
                # One-page layout: the list entries are the image links.
                img_link = page
            else:
                img_link = self._get_image(page_source)[0]

            file_dir = '{}/{:02d}.jpg'.format(ch_dir, page_num)

            download_image(img_link, file_dir)
            self._PROGRESS[series_name][ch_num]['progress'] = page_num

            print('  Chapter #{} - Downloaded page #{}'.format(
                ch_num, page_num))

        print('Done with chapter {}!'.format(ch_num))
        self._PROGRESS[series_name][ch_num]['progress'] = 'completed'

        return len(page_list)
예제 #2
0
    def _process_chapter(self, ch_num, ch_link, series_dir, series_name, pages=None):
        """Download all (or a selected subset of) page images of one chapter.

        Args:
            ch_num: Chapter identifier used for the on-disk directory,
                progress bookkeeping, and log output.
            ch_link: URL of the chapter page.
            series_dir: Directory holding the whole series.
            series_name: Progress-tracking key in ``self._PROGRESS``.
            pages: Optional collection of 1-based page numbers to restrict
                the download to.

        Returns:
            Number of pages the chapter contains.
        """
        print('Downloading chapter #{} from {}'.format(ch_num, ch_link))

        ch_dir = '{}/{}'.format(series_dir, ch_num)
        if not os.path.exists(ch_dir):
            os.makedirs(ch_dir)

        page_source = get_page_source(ch_link)
        page_list = self._get_page_list(page_source)

        # Set up progress bookkeeping for this series/chapter pair.
        if series_name not in self._PROGRESS:
            self._PROGRESS[series_name] = {}
        if ch_num not in self._PROGRESS[series_name]:
            self._PROGRESS[series_name][ch_num] = {
                'progress': 0,
                'max': len(page_list)
            }

        for index, page in enumerate(page_list):
            page_num = index + 1

            # Single-page mode: only download explicitly requested pages.
            if pages and page_num not in pages:
                continue

            if index != 0:
                # Throttle between page requests.
                time.sleep(random.uniform(1.5, 3.0))

                # The chapter source fetched above serves the first page;
                # in multi-page mode later pages need their own fetch.
                if not self._ONE_PAGE:
                    page_source = get_page_source(page)

            if self._ONE_PAGE:
                # One-page layout: the list entries are the image links.
                img_link = page
            else:
                img_link = self._get_image(page_source)[0]

            file_dir = '{}/{:02d}.jpg'.format(ch_dir, page_num)

            download_image(img_link, file_dir)
            self._PROGRESS[series_name][ch_num]['progress'] = page_num

            print('  Chapter #{} - Downloaded page #{}'.format(ch_num, page_num))

        print('Done with chapter {}!'.format(ch_num))
        self._PROGRESS[series_name][ch_num]['progress'] = 'completed'

        return len(page_list)
예제 #3
0
def get_zhihu_video_download_url(video_url):
    """Return the direct video ``src`` URL scraped from a Zhihu video page.

    Args:
        video_url: URL of the Zhihu video page.

    Returns:
        The first matching video source URL, or ``None`` when the page
        yields no matching ``<video src>`` attribute.
    """
    page_source = utils.get_page_source(video_url)
    src = etree.HTML(page_source).xpath(
        '//*[@id="player"]/div/div/div[1]/video/@src')
    # Idiomatic emptiness check instead of len(src) == 0.
    if not src:
        logger.error('没有发现视频链接')
        return None
    return src[0]
예제 #4
0
    def _get_search_result(self, search_value, re_compile, search):
        """Run a site search and return the cleaned-up result dictionaries.

        Args:
            search_value: Query values sent as GET or POST parameters,
                depending on ``self._SEARCH_REST``.
            re_compile: Regex pattern string used to extract results.
            search: Passed through to ``self._get_re_match_dict``.

        Returns:
            List of match dicts with at least ``'title'`` and ``'url'``
            keys; titles are whitespace-stripped and URLs made absolute
            when ``self._REL_LINK`` is set.
        """
        url = self._SEARCH_LINK
        page_source = None

        # GET and POST are mutually exclusive transport modes.
        if self._SEARCH_REST == 'get':
            page_source = get_page_source(url, get_values=search_value)
        elif self._SEARCH_REST == 'post':
            page_source = get_page_source(url, post_values=search_value)

        # Compile once into a fresh name instead of re-binding the parameter.
        pattern = re.compile(re_compile, re.S)
        match_result = self._get_re_match_dict(page_source, pattern, search)

        for match in match_result:
            match['title'] = match['title'].strip()

            if self._REL_LINK:
                match['url'] = self._get_abs_link(match['url'])

        return match_result
예제 #5
0
    def _get_search_result(self, search_value, re_compile, search):
        """Search the site and return a list of parsed result dicts.

        Each result's ``'title'`` is whitespace-stripped; when
        ``self._REL_LINK`` is enabled, each relative ``'url'`` is converted
        to an absolute link before being returned.
        """
        link = self._SEARCH_LINK
        page_source = None

        if self._SEARCH_REST == 'get':
            page_source = get_page_source(link, get_values=search_value)

        if self._SEARCH_REST == 'post':
            page_source = get_page_source(link, post_values=search_value)

        compiled = re.compile(re_compile, re.S)
        results = self._get_re_match_dict(page_source, compiled, search)

        for entry in results:
            entry['title'] = entry['title'].strip()
            if self._REL_LINK:
                entry['url'] = self._get_abs_link(entry['url'])

        return results
예제 #6
0
    def _gen_chapter_list(self, url, re_compile, search):
        """Populate ``self._CHAPTER_LIST`` with chapter entries for *url*.

        Skips the scrape entirely when *url* matches the previously
        processed link, so repeated calls for the same series are cheap.

        Args:
            url: Series page URL to scrape.
            re_compile: Regex pattern string extracting chapter entries;
                the first captured group is the chapter link.
            search: Passed through to ``self._get_re_match``.
        """
        # Same URL as last time -> the chapter list is already cached.
        if self._PREV_LINK == url:
            return
        self._PREV_LINK = url

        page_source = get_page_source(url)
        pattern = re.compile(re_compile, re.S)
        match_result = self._get_re_match(page_source, pattern, search)

        if self._REL_LINK:
            # Tuples -> lists so the link element can be rewritten in place.
            match_result = [list(match) for match in match_result]
            for match in match_result:
                match[0] = self._get_abs_link(match[0])

        self._CHAPTER_LIST = match_result
예제 #7
0
    def _get_series_info(self, url, values, search):
        """Fill *values* in place with scraped series metadata.

        Args:
            url: Series page to scrape.
            values: Dict mapping info keys to regex pattern strings; each
                pattern is replaced by its tag-stripped match text, or ''
                when the page does not yield exactly one match.
            search: Two delimiter strings bounding the relevant page region.

        Returns:
            The same *values* dict, mutated with the extracted text.
        """
        source = get_page_source(url)
        # Restrict matching to the slice between the two delimiters.
        start = source.find(search[0])
        end = source.find(search[1])
        source = source[start:end]

        for key in values:
            pattern = re.compile(values[key], re.S)
            matches = self._get_re_match(source, pattern)

            # Keep the value only when the pattern matched unambiguously,
            # stripping anchor tags and non-breaking spaces from it.
            if len(matches) == 1:
                values[key] = re.sub(r'(<a.+?>|</a>|&nbsp;)', '',
                                     matches[0], 0, re.S)
            else:
                values[key] = ''

        return values
예제 #8
0
    def _gen_chapter_list(self, url, re_compile, search):
        """Scrape *url* and cache its chapter entries in ``self._CHAPTER_LIST``.

        A repeated call with the same *url* as last time is a no-op, so the
        chapter list is not regenerated unnecessarily.

        Args:
            url: Series page URL to scrape.
            re_compile: Regex pattern string extracting chapter entries;
                the first captured group is the chapter link.
            search: Passed through to ``self._get_re_match``.
        """
        # Guard clause: nothing to do when the URL was already processed.
        if self._PREV_LINK == url:
            return
        self._PREV_LINK = url

        page_source = get_page_source(url)
        pattern = re.compile(re_compile, re.S)
        match_result = self._get_re_match(page_source, pattern, search)

        if self._REL_LINK:
            # Convert tuples to lists so the link can be rewritten in place.
            match_result = [list(match) for match in match_result]
            for match in match_result:
                match[0] = self._get_abs_link(match[0])

        self._CHAPTER_LIST = match_result
예제 #9
0
    def _get_series_info(self, url, values, search):
        """Scrape series metadata, overwriting *values* entries in place.

        Each entry of *values* starts out as a regex pattern string and
        ends up as the matched text (anchor tags and ``&nbsp;`` removed),
        or the empty string when there was not exactly one match.

        Returns:
            The same *values* dict that was passed in.
        """
        html = get_page_source(url)
        # Only the region between the two search markers is examined.
        html = html[html.find(search[0]):html.find(search[1])]

        strip_tags = re.compile(r'(<a.+?>|</a>|&nbsp;)', re.S)
        for key in values:
            matches = self._get_re_match(html, re.compile(values[key], re.S))
            if len(matches) == 1:
                # Remove markup leftovers from the single matched value.
                values[key] = strip_tags.sub('', matches[0], 0)
            else:
                values[key] = ''

        return values