# Example #1
def main(argv):
    """Fetch an article URL, extract its readable content, inline every image
    as a base64 data URI, and write the result as a standalone HTML file.

    Command line: -o/--output <file>  -i/--input <url>
    Silently returns on bad options, missing arguments, or a failed fetch.
    """
    outputfile = ''
    inputurl = ''
    try:
        opts, args = getopt.getopt(argv, "o:i:", ["output=", "input="])
    except getopt.GetoptError:
        return

    for opt, arg in opts:
        # BUGFIX: the long forms were declared to getopt but never matched,
        # so --output/--input were silently ignored.
        if opt in ('-o', '--output'):
            outputfile = arg
        elif opt in ('-i', '--input'):
            inputurl = arg

    # Guard: without both arguments the calls below would fail obscurely.
    if not outputfile or not inputurl:
        return

    try:
        res = requests.get(inputurl, headers=GENERAL_HEADERS)
    except requests.RequestException:
        # BUGFIX: the original swallowed the error and fell through,
        # hitting a NameError on `res` below.
        return

    # Extract the de-noised (boilerplate-stripped) article body.
    text = Document(res.text).summary()

    soup = BeautifulSoup(text, "lxml")
    imgs = soup.find_all('img')

    # Download every image and replace its src with an inline data URI.
    for i in imgs:
        img_link = i.attrs.get('src')
        if not img_link:
            continue
        extension = get_extension(img_link)
        if extension is None:
            # No point fetching an image we cannot type; the original
            # downloaded it anyway and then threw the bytes away.
            continue
        try:
            r = requests.get(img_link)
        except requests.RequestException:
            # BUGFIX: a swallowed failure previously reused a stale/undefined `r`.
            continue
        encoded = base64.b64encode(r.content).decode('ascii')
        text = text.replace(
            img_link, "data:image/%s;base64,%s" % (extension, encoded))

    # Emit a minimal standalone HTML document (with-block guarantees close).
    with open(outputfile, 'w') as f:
        f.write(
            '<!DOCTYPE html><html><head><meta charset="UTF-8"><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><title>Document</title></head><body>'
        )
        f.write(text)
        f.write('</body></html>')
# Example #2
    file = open('baidu_result\\' + str(no) + '.txt', 'w')
    file.write(title + '\n' + http + '\n' + article)


if __name__ == "__main__":
    httplist = list()
    clearpath()  # 清空文件夹
    i = 0
    for httplist_ in gethttp(5000):  #从百度新闻上爬取新闻链接
        for http in httplist_:
            print str(i), ': ', http
            article = '1'
            try:
                req = urllib2.Request(http, headers=agent)
                html = urllib2.urlopen(req)
                html = html.read()
                article = Document(html).summary()  #提取正文
                title = Document(html).short_title()  #提取标题
                html = str(BeautifulSoup(html, "html.parser"))
                dr = re.compile(r'<[^>]+>')  #定义正则
                article = dr.sub('', article)  #去除html标签
                article = article.replace(' ', '')  #去除空格
                article = article.replace('\n', '')  #去除换行
            except Exception, e:
                title = http
                article = 'HTTPError'

            print title  #标题打印到屏幕上
            writefile(i, title, article, http)  #创建txt,写入
            i += 1