# Example #1
def main(argv):
    """Fetch an article URL, extract its readable content, inline every image
    as a base64 data URI, and write the result as a standalone HTML file.

    Command line: -o/--output <file>  -i/--input <url>
    Silently returns on bad options, missing arguments, or a failed fetch.
    """
    outputfile = ''
    inputurl = ''
    try:
        opts, args = getopt.getopt(argv, "o:i:", ["output=", "input="])
    except getopt.GetoptError:
        return

    for opt, arg in opts:
        # BUGFIX: the long forms were declared to getopt but never matched,
        # so --output/--input were silently ignored.
        if opt in ('-o', '--output'):
            outputfile = arg
        elif opt in ('-i', '--input'):
            inputurl = arg

    # Guard: without both arguments the calls below would fail obscurely.
    if not outputfile or not inputurl:
        return

    try:
        res = requests.get(inputurl, headers=GENERAL_HEADERS)
    except requests.RequestException:
        # BUGFIX: the original swallowed the error and fell through,
        # hitting a NameError on `res` below.
        return

    # Extract the de-noised (boilerplate-stripped) article body.
    text = Document(res.text).summary()

    soup = BeautifulSoup(text, "lxml")
    imgs = soup.find_all('img')

    # Download every image and replace its src with an inline data URI.
    for i in imgs:
        img_link = i.attrs.get('src')
        if not img_link:
            continue
        extension = get_extension(img_link)
        if extension is None:
            # No point fetching an image we cannot type; the original
            # downloaded it anyway and then threw the bytes away.
            continue
        try:
            r = requests.get(img_link)
        except requests.RequestException:
            # BUGFIX: a swallowed failure previously reused a stale/undefined `r`.
            continue
        encoded = base64.b64encode(r.content).decode('ascii')
        text = text.replace(
            img_link, "data:image/%s;base64,%s" % (extension, encoded))

    # Emit a minimal standalone HTML document (with-block guarantees close).
    with open(outputfile, 'w') as f:
        f.write(
            '<!DOCTYPE html><html><head><meta charset="UTF-8"><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><title>Document</title></head><body>'
        )
        f.write(text)
        f.write('</body></html>')
# Example #2
    file = open('baidu_result\\' + str(no) + '.txt', 'w')
    file.write(title + '\n' + http + '\n' + article)


if __name__ == "__main__":
    httplist = list()
    clearpath()  # 清空文件夹
    i = 0
    for httplist_ in gethttp(5000):  #从百度新闻上爬取新闻链接
        for http in httplist_:
            print str(i), ': ', http
            article = '1'
            try:
                req = urllib2.Request(http, headers=agent)
                html = urllib2.urlopen(req)
                html = html.read()
                article = Document(html).summary()  #提取正文
                title = Document(html).short_title()  #提取标题
                html = str(BeautifulSoup(html, "html.parser"))
                dr = re.compile(r'<[^>]+>')  #定义正则
                article = dr.sub('', article)  #去除html标签
                article = article.replace(' ', '')  #去除空格
                article = article.replace('\n', '')  #去除换行
            except Exception, e:
                title = http
                article = 'HTTPError'

            print title  #标题打印到屏幕上
            writefile(i, title, article, http)  #创建txt,写入
            i += 1