Example #1
0
def get_posts(url):
    """Convert the posts on a thread page into one Markdown string.

    Every element with class ``t_fsz`` on the page at ``url`` is treated as
    one post.  For each post the ``pstatus`` and ``tip`` elements are emptied,
    ``<div>`` tags become ``<p>`` tags, lazy-load ``file="`` attributes are
    renamed to ``src="``, each distinct image URL is replaced by whatever
    ``upload_img`` returns for it, and the resulting HTML is handed to
    ``html2markdown``.

    Only the page at ``url`` itself is read; further pages of the thread are
    not followed.

    Args:
        url: Address of the thread page, passed unchanged to ``parser``.

    Returns:
        The converted posts concatenated in page order, or an empty string
        when the page contains no ``t_fsz`` element.
    """
    img_src = r"""\bsrc\b\s*=\s*[\'\"]?([^\'\"]*)[\'\"]?"""
    converted = []
    for post in parser(url).find_all(class_='t_fsz'):
        # Empty the "this post was last edited by ..." notice and the
        # "attachment: ..." tip.  Each lookup is guarded on its own so that a
        # post without the first element still gets the second one removed.
        for css_class in ('pstatus', 'tip'):
            try:
                post.find(class_=css_class).clear()
            except AttributeError:
                # Element not present in this post; nothing to strip.
                pass
        html = str(post)
        html = re.sub(r'<div[^>]*>', '<p>', html)
        html = re.sub(r'</div[^>]*>', '</p>', html)
        html = html.replace('file="', 'src="')  # lazy-loaded images
        # dict.fromkeys de-duplicates while keeping first-seen order, so an
        # image that appears several times is uploaded only once.
        for img in dict.fromkeys(re.findall(img_src, html)):
            if not img:
                # An empty src would make str.replace('', ...) insert the new
                # URL between every character of the post.
                continue
            html = html.replace(img, upload_img(img))
        converted.append(html2markdown(html))
    return ''.join(converted)
Example #2
0
def get_posts(url):
    """Convert the article body of the page at ``url`` to Markdown.

    The first element with class ``article-holder`` is serialised to HTML,
    lazy-load ``data-src`` attributes are renamed to ``src``, each distinct
    image URL is replaced by whatever ``upload_img`` returns for it, and the
    result is passed through ``html2markdown``.

    Args:
        url: Address of the article page, passed unchanged to ``parser``.

    Returns:
        The value returned by ``html2markdown`` for the rewritten HTML, or an
        empty string when the page has no ``article-holder`` element.
    """
    holder = parser(url).find(class_='article-holder')
    if holder is None:
        # Without this guard str(None) would be converted and returned as the
        # literal text "None".
        return ''
    html = str(holder).replace('data-src', 'src')
    img_src = r"""\bsrc\b\s*=\s*[\'\"]?([^\'\"]*)[\'\"]?"""
    # dict.fromkeys de-duplicates while keeping first-seen order, so a
    # repeated image is uploaded only once.
    for img in dict.fromkeys(re.findall(img_src, html)):
        if not img:
            # An empty src would make str.replace('', ...) insert the new URL
            # between every character of the article.
            continue
        html = html.replace(img, upload_img(img))
    return html2markdown(html)
Example #3
0
def get_posts(url):
    """Convert the quoted content of the page at ``url`` to Markdown.

    The first element with class ``quote-content`` is serialised to HTML and
    ``data-original`` attributes are renamed to ``src``.  ``src`` attributes
    that point at the lazy-load placeholder image are dropped; every other
    distinct image URL is uploaded without its query string via
    ``upload_img`` and replaced by the returned value.  The result is passed
    through ``html2markdown``.

    Args:
        url: Address of the page, passed unchanged to ``parser``.

    Returns:
        The value returned by ``html2markdown`` for the rewritten HTML, or an
        empty string when the page has no ``quote-content`` element.
    """
    holder = parser(url).find(class_='quote-content')
    if holder is None:
        # Without this guard str(None) would be converted and returned as the
        # literal text "None".
        return ''
    img_src = r"""\bsrc\b\s*=\s*[\'\"]?([^\'\"]*)[\'\"]?"""
    html = str(holder).replace('data-original', 'src')
    place_holder = 'https://b1.hoopchina.com.cn/web/sns/bbs/images/placeholder.png'
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # placeholder and any repeated image are each handled only once.
    for img in dict.fromkeys(re.findall(img_src, html)):
        if not img:
            # An empty src would make str.replace('', ...) insert the new URL
            # between every character of the content.
            continue
        if img == place_holder:  # lazy-load placeholder
            html = html.replace(f'src="{place_holder}"', '')
            continue
        # Upload the image without its query string, but replace the full
        # original URL (query included) in the HTML.
        img_real = img.split('?', 1)[0]
        html = html.replace(img, upload_img(img_real))
    return html2markdown(html)
Example #4
0
def get_posts(url):
    """Convert every post of a multi-page thread into one Markdown string.

    The page count is read from the text of the second ``<span>`` inside the
    first ``<li class="l_reply_num">`` of the page at ``url``.  Pages
    ``1..count`` are then fetched by appending ``&pn=<page>`` to ``url`` and
    every element with class ``d_post_content`` is collected.  For each post
    ``<div>`` tags become ``<p>`` tags, each distinct image URL is replaced by
    whatever ``upload_img`` returns for it, and the HTML is passed through
    ``html2markdown``.

    Args:
        url: Address of the thread, passed to ``parser``.  ``&pn=<page>`` is
            appended verbatim, so the address is presumably expected to
            already carry a query string -- confirm against callers.

    Returns:
        The converted posts concatenated in page order, or an empty string
        when no post is found.
    """
    page = parser(url).find_all(
        'li', class_='l_reply_num')[0].find_all('span')[1].text
    posts = []
    for page_no in range(1, int(page) + 1):
        page_url = url + '&pn=' + str(page_no)
        posts.extend(parser(page_url).find_all(class_='d_post_content'))
    img_src = r"""\bsrc\b\s*=\s*[\'\"]?([^\'\"]*)[\'\"]?"""
    converted = []
    for post in posts:
        html = str(post)
        html = re.sub(r'<div[^>]*>', '<p>', html)
        html = re.sub(r'</div[^>]*>', '</p>', html)
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # repeated image is uploaded only once per post.
        for img in dict.fromkeys(re.findall(img_src, html)):
            if not img:
                # An empty src would make str.replace('', ...) insert the new
                # URL between every character of the post.
                continue
            html = html.replace(img, upload_img(img))
        converted.append(html2markdown(html))
    return ''.join(converted)
Example #5
0
 def download_img(self, url):
     """Fetch ``url`` and hand the downloaded bytes to ``upload_img``.

     The request is made with ``self.session.get`` using ``self.headers``.
     ``upload_img`` receives the response body, an empty string and the
     value of the response's ``Content-Type`` header, in that order.

     NOTE(review): ``self.session`` looks like a requests-style session --
     confirm.  The HTTP status is not checked before uploading.

     Args:
         url: Address of the image to download.

     Returns:
         Whatever ``upload_img`` returns.
     """
     resp = self.session.get(url, headers=self.headers)
     return upload_img(resp.content, "", resp.headers['Content-Type'])