def get_posts(url):
    """Fetch the thread page at ``url`` and return its posts as Markdown.

    Every element with class ``t_fsz`` is treated as one post.  For each
    post the ``pstatus`` and ``tip`` children are cleared, ``<div>`` tags
    are rewritten as ``<p>`` tags, lazy-load ``file="..."`` attributes are
    renamed to ``src="..."``, every captured image URL is passed through
    ``upload_img`` and replaced by its return value, and the result is
    converted with ``html2markdown``.

    Only the single page at ``url`` is read; pagination is not followed.

    Args:
        url: Address of the thread page to scrape.

    Returns:
        The Markdown of all posts concatenated in page order; an empty
        string when no ``t_fsz`` element is found.
    """
    div_open = re.compile('<div[^>]*>')
    # Raw string: the previous '<\/div...' literal relied on an invalid
    # escape sequence.  The pattern matches exactly the same text.
    div_close = re.compile(r'</div[^>]*>')
    img_src = re.compile(r"""\bsrc\b\s*=\s*[\'\"]?([^\'\"]*)[\'\"]?""")

    chunks = []
    for post in parser(url).find_all(class_='t_fsz'):
        # 'pstatus' holds the "this post was last edited ..." notice and
        # 'tip' the "attachment: ..." notice.  Each is handled on its own
        # so a missing 'pstatus' no longer prevents 'tip' being cleared.
        for css_class in ('pstatus', 'tip'):
            notice = post.find(class_=css_class)
            if notice is not None:
                notice.clear()

        html = str(post)
        html = div_open.sub('<p>', html)
        html = div_close.sub('</p>', html)
        # Lazy loading: the image address is carried in a `file` attribute.
        html = html.replace('file="', 'src="')

        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # URL that occurs several times is uploaded only once; the single
        # str.replace already rewrites every occurrence.
        for img in dict.fromkeys(img_src.findall(html)):
            if not img:
                # The pattern can capture '' (e.g. src=""); replacing the
                # empty string would splice the upload result between
                # every character of the post.
                continue
            html = html.replace(img, upload_img(img))

        chunks.append(html2markdown(html))
    return ''.join(chunks)
def get_posts(url):
    """Fetch the article at ``url`` and return its body as Markdown.

    The first element with class ``article-holder`` is serialised to HTML,
    lazy-load ``data-src`` attributes are renamed to ``src``, every
    captured image URL is passed through ``upload_img`` and replaced by
    its return value, and the result is converted with ``html2markdown``.

    Args:
        url: Address of the article page to scrape.

    Returns:
        Whatever ``html2markdown`` returns for the rewritten HTML.
    """
    # NOTE(review): when no 'article-holder' element exists, `find`
    # presumably returns None and the literal text 'None' is converted;
    # confirm whether callers rely on that before changing it.
    post = str(parser(url).find(class_='article-holder'))
    post = post.replace('data-src', 'src')

    img_src = r"""\bsrc\b\s*=\s*[\'\"]?([^\'\"]*)[\'\"]?"""
    # dict.fromkeys de-duplicates while keeping first-seen order, so a URL
    # that occurs several times is uploaded only once; the single
    # str.replace already rewrites every occurrence.
    for img in dict.fromkeys(re.findall(img_src, post)):
        if not img:
            # The pattern can capture '' (e.g. src=""); replacing the empty
            # string would splice the upload result between every character.
            continue
        post = post.replace(img, upload_img(img))

    return html2markdown(post)
def get_posts(url):
    """Fetch the post at ``url`` and return its quoted content as Markdown.

    The first element with class ``quote-content`` is serialised to HTML
    and lazy-load ``data-original`` attributes are renamed to ``src``.
    ``src`` attributes pointing at the lazy-load placeholder image are
    removed.  Every other captured image URL is uploaded through
    ``upload_img`` with its query string stripped, and the original URL
    (query string included) is replaced by the upload result.  The HTML is
    finally converted with ``html2markdown``.

    Args:
        url: Address of the post page to scrape.

    Returns:
        Whatever ``html2markdown`` returns for the rewritten HTML.
    """
    # NOTE(review): when no 'quote-content' element exists, `find`
    # presumably returns None and the literal text 'None' is converted;
    # confirm whether callers rely on that before changing it.
    post = str(parser(url).find(class_='quote-content'))
    img_src = r"""\bsrc\b\s*=\s*[\'\"]?([^\'\"]*)[\'\"]?"""
    post = post.replace('data-original', 'src')
    place_holder = 'https://b1.hoopchina.com.cn/web/sns/bbs/images/placeholder.png'

    # dict.fromkeys de-duplicates while keeping first-seen order, so a URL
    # that occurs several times is uploaded only once; each str.replace
    # below already rewrites every occurrence.
    for img in dict.fromkeys(re.findall(img_src, post)):
        if not img:
            # The pattern can capture '' (e.g. src=""); replacing the empty
            # string would splice the upload result between every character.
            continue
        if img == place_holder:
            # Lazy loading: drop the placeholder attribute entirely.
            post = post.replace(f'src="{place_holder}"', '')
            continue
        # Upload the address without its query string; partition returns
        # the whole string when there is no '?'.
        img_real = img.partition('?')[0]
        post = post.replace(img, upload_img(img_real))

    return html2markdown(post)
def get_posts(url):
    """Fetch every page of the thread at ``url`` and return it as Markdown.

    The page count is read from the text of the second ``<span>`` inside
    the first ``<li class="l_reply_num">`` of the page at ``url``.  Pages
    ``1..count`` are then fetched by appending ``&pn=<n>`` to ``url`` and
    every element with class ``d_post_content`` is collected as one post.
    For each post ``<div>`` tags are rewritten as ``<p>`` tags, every
    captured image URL is passed through ``upload_img`` and replaced by
    its return value, and the result is converted with ``html2markdown``.

    Args:
        url: Address of the thread.  NOTE(review): ``&pn=`` is appended
            verbatim, so this presumably must already contain a query
            string -- confirm against callers.

    Returns:
        The Markdown of all posts concatenated in page order.

    Raises:
        IndexError: If the page-count elements are not present.
        ValueError: If the page-count text is not an integer.
    """
    page = parser(url).find_all(
        'li', class_='l_reply_num')[0].find_all('span')[1].text

    # Collect all pages first, so every page is fetched before any upload.
    posts = []
    for i in range(1, int(page) + 1):
        page_url = url + '&pn=' + str(i)
        posts.extend(parser(page_url).find_all(class_='d_post_content'))

    div_open = re.compile('<div[^>]*>')
    # Raw string: the previous '<\/div...' literal relied on an invalid
    # escape sequence.  The pattern matches exactly the same text.
    div_close = re.compile(r'</div[^>]*>')
    img_src = re.compile(r"""\bsrc\b\s*=\s*[\'\"]?([^\'\"]*)[\'\"]?""")

    chunks = []
    for post in posts:
        html = str(post)
        html = div_open.sub('<p>', html)
        html = div_close.sub('</p>', html)
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # URL that occurs several times is uploaded only once; the single
        # str.replace already rewrites every occurrence.
        for img in dict.fromkeys(img_src.findall(html)):
            if not img:
                # The pattern can capture '' (e.g. src=""); replacing the
                # empty string would splice the upload result between
                # every character of the post.
                continue
            html = html.replace(img, upload_img(img))
        chunks.append(html2markdown(html))
    return ''.join(chunks)
def download_img(self, url):
    """Download ``url`` and pass the payload on to ``upload_img``.

    The request is issued with ``self.session.get`` using ``self.headers``.
    The response body, an empty string and the response's ``Content-Type``
    header value are then passed, in that order, to ``upload_img``.

    Args:
        url: Address of the image to download.

    Returns:
        Whatever ``upload_img`` returns.
    """
    # presumably a requests-style session/response pair -- verify
    resp = self.session.get(url, headers=self.headers)
    return upload_img(resp.content, "", resp.headers['Content-Type'])