def _process_chapter(self, ch_num, ch_link, series_dir, series_name, pages=None):
    """Download every page image of one chapter into series_dir/ch_num.

    Args:
        ch_num: Chapter identifier; used as the directory name and in logs.
        ch_link: URL of the chapter's index page.
        series_dir: Root directory of the series on disk.
        series_name: Key under which progress is tracked in self._PROGRESS.
        pages: Optional collection of 1-based page numbers; when given,
            only those pages are downloaded.

    Returns:
        The total number of pages listed for the chapter.
    """
    print('Downloading chapter #{} from {}'.format(ch_num, ch_link))
    ch_dir = '{}/{}'.format(series_dir, ch_num)
    # exist_ok=True replaces the original exists()-then-makedirs() pair,
    # which was racy if two downloads targeted the same directory.
    os.makedirs(ch_dir, exist_ok=True)
    page_source = get_page_source(ch_link)
    page_list = self._get_page_list(page_source)
    if series_name not in self._PROGRESS:
        self._PROGRESS[series_name] = {}
    if ch_num not in self._PROGRESS[series_name]:
        self._PROGRESS[series_name][ch_num] = {
            'progress': 0,
            'max': len(page_list)
        }
    for index, page in enumerate(page_list):
        # Single-page mode: skip pages the caller did not ask for.
        if pages and index + 1 not in pages:
            continue
        # Rate-limit requests after the first page.
        if index != 0:
            time.sleep(random.uniform(1.5, 3.0))
        if self._ONE_PAGE:
            # Site serves image links directly on the chapter page.
            img_link = page
        else:
            # Each page has its own URL; fetch it and extract the image.
            page_source = get_page_source(page)
            img_link = self._get_image(page_source)[0]
        jpg = '{:02d}.jpg'.format(index + 1)
        file_dir = '{}/{}'.format(ch_dir, jpg)
        download_image(img_link, file_dir)
        self._PROGRESS[series_name][ch_num]['progress'] = index + 1
        print(' Chapter #{} - Downloaded page #{}'.format(
            ch_num, index + 1))
    print('Done with chapter {}!'.format(ch_num))
    self._PROGRESS[series_name][ch_num]['progress'] = 'completed'
    return len(page_list)
def _process_chapter(self, ch_num, ch_link, series_dir, series_name, pages=None):
    """Download every page image of one chapter into series_dir/ch_num.

    Args:
        ch_num: Chapter identifier; used as the directory name and in logs.
        ch_link: URL of the chapter's index page.
        series_dir: Root directory of the series on disk.
        series_name: Key under which progress is tracked in self._PROGRESS.
        pages: Optional collection of 1-based page numbers; when given,
            only those pages are downloaded.

    Returns:
        The total number of pages listed for the chapter.
    """
    print('Downloading chapter #{} from {}'.format(ch_num, ch_link))
    ch_dir = '{}/{}'.format(series_dir, ch_num)
    # exist_ok=True replaces the original exists()-then-makedirs() pair,
    # which was racy if two downloads targeted the same directory.
    os.makedirs(ch_dir, exist_ok=True)
    page_source = get_page_source(ch_link)
    page_list = self._get_page_list(page_source)
    if series_name not in self._PROGRESS:
        self._PROGRESS[series_name] = {}
    if ch_num not in self._PROGRESS[series_name]:
        self._PROGRESS[series_name][ch_num] = {
            'progress': 0,
            'max': len(page_list)
        }
    for index, page in enumerate(page_list):
        # Single-page mode: skip pages the caller did not ask for.
        if pages and index + 1 not in pages:
            continue
        # Rate-limit requests after the first page.
        if index != 0:
            time.sleep(random.uniform(1.5, 3.0))
        if self._ONE_PAGE:
            # Site serves image links directly on the chapter page.
            img_link = page
        else:
            # Each page has its own URL; fetch it and extract the image.
            page_source = get_page_source(page)
            img_link = self._get_image(page_source)[0]
        jpg = '{:02d}.jpg'.format(index + 1)
        file_dir = '{}/{}'.format(ch_dir, jpg)
        download_image(img_link, file_dir)
        self._PROGRESS[series_name][ch_num]['progress'] = index + 1
        print(' Chapter #{} - Downloaded page #{}'.format(ch_num, index + 1))
    print('Done with chapter {}!'.format(ch_num))
    self._PROGRESS[series_name][ch_num]['progress'] = 'completed'
    return len(page_list)
def get_zhihu_video_download_url(video_url):
    """Extract the direct video source URL from a Zhihu video page.

    Args:
        video_url: URL of the Zhihu video page.

    Returns:
        The first video ``src`` attribute found in the player element,
        or None when the page contains no video link.
    """
    page_source = utils.get_page_source(video_url)
    src = etree.HTML(page_source).xpath(
        '//*[@id="player"]/div/div/div[1]/video/@src')
    # xpath() returns a list; empty means no video element matched.
    if not src:
        logger.error('没有发现视频链接')
        return None
    return src[0]
def _get_search_result(self, search_value, re_compile, search):
    """Query the site's search endpoint and return parsed result dicts.

    Issues the request via GET or POST according to self._SEARCH_REST,
    strips whitespace from each result title, and rewrites relative
    result links to absolute ones when the site uses relative links.

    Args:
        search_value: Query parameters or form values for the request.
        re_compile: Regex source used to scrape results from the page.
        search: Passed through to self._get_re_match_dict.

    Returns:
        A list of match dicts with at least 'title' and 'url' keys.
    """
    endpoint = self._SEARCH_LINK
    page_source = None
    if self._SEARCH_REST == 'get':
        page_source = get_page_source(endpoint, get_values=search_value)
    elif self._SEARCH_REST == 'post':
        page_source = get_page_source(endpoint, post_values=search_value)
    pattern = re.compile(re_compile, re.S)
    results = self._get_re_match_dict(page_source, pattern, search)
    for entry in results:
        entry['title'] = entry['title'].strip()
        if self._REL_LINK:
            entry['url'] = self._get_abs_link(entry['url'])
    return results
def _gen_chapter_list(self, url, re_compile, search):
    """Build self._CHAPTER_LIST by scraping the chapter index at url.

    Skips all work when url matches the previously processed link, so
    repeated calls for the same series do not re-fetch the page.

    Args:
        url: URL of the series' chapter-index page.
        re_compile: Regex source used to scrape chapter entries.
        search: Passed through to self._get_re_match.
    """
    # TODO: Necessary?
    if self._PREV_LINK != url:
        self._PREV_LINK = url
        page_source = get_page_source(url)
        pattern = re.compile(re_compile, re.S)
        match_result = self._get_re_match(page_source, pattern, search)
        if self._REL_LINK:
            # Regex matches are immutable tuples; convert to lists so
            # the link slot can be rewritten to an absolute URL.
            match_result = [list(match) for match in match_result]
            for match in match_result:
                match[0] = self._get_abs_link(match[0])
        self._CHAPTER_LIST = match_result
def _get_series_info(self, url, values, search):
    """Fill the values dict with series metadata scraped from url.

    Args:
        url: URL of the series' info page.
        values: Maps each field name to the regex source used to
            extract it; mutated in place so each key ends up holding
            the cleaned match text (or '' when no unique match).
        search: Pair (start_marker, end_marker) bounding the region of
            the page to scan.

    Returns:
        The same values dict, updated in place.

    NOTE(review): str.find returns -1 when a marker is missing, which
    silently yields a wrong slice of the page — confirm the markers
    always exist for every supported site.
    """
    page_source = get_page_source(url)
    start = page_source.find(search[0])
    end = page_source.find(search[1])
    section = page_source[start:end]
    for key in values:
        pattern = re.compile(values[key], re.S)
        matches = self._get_re_match(section, pattern)
        # Only accept an unambiguous single match for the field.
        if len(matches) == 1:
            # Strip anchor tags and stray entity characters from the text.
            values[key] = re.sub(r'(<a.+?>|</a>| )', '', matches[0], 0, re.S)
        else:
            values[key] = ''
    return values