def main(argv):
    """Fetch a web page, extract its readable article content, inline every
    image as a base64 data URI, and write the result as one standalone HTML
    file.

    Command line (argv, as passed to getopt):
        -o FILE, --output=FILE  path of the HTML file to write
        -i URL,  --input=URL    URL of the page to fetch

    Returns None.  On a bad command line or a failed page download the
    function returns without writing anything (the original code's
    best-effort contract, made explicit instead of crashing on an
    unbound variable).
    """
    outputfile = ''
    inputurl = ''
    try:
        opts, args = getopt.getopt(argv, "o:i:", ["output=", "input="])
    except getopt.GetoptError:
        # Keep the original silent-return behaviour on a bad command line,
        # but only swallow the getopt error, not everything.
        return
    for opt, arg in opts:
        # BUGFIX: the long forms were declared to getopt but never matched,
        # so `--output=x --input=y` used to parse and then be ignored.
        if opt in ('-o', '--output'):
            outputfile = arg
        elif opt in ('-i', '--input'):
            inputurl = arg

    try:
        res = requests.get(inputurl, headers=GENERAL_HEADERS)
    except requests.RequestException:
        # BUGFIX: the original did `except: pass` and then used `res`,
        # raising NameError on any network failure.  Bail out instead.
        return

    # Readability-cleaned article body as utf-8 encoded bytes; all later
    # src replacements operate on this buffer.
    text = Document(res.text).summary().encode('utf-8')
    soup = BeautifulSoup(text, "lxml")

    # Download each referenced image and replace its src with an inline
    # base64 data URI.
    for img in soup.find_all('img'):
        src = img.attrs.get('src')
        if not src:
            continue
        extension = get_extension(src)
        if extension is None:
            # Unknown image type: leave the original URL untouched
            # (the original fetched the image anyway and discarded it).
            continue
        try:
            r = requests.get(src)
        except requests.RequestException:
            # BUGFIX: on a failed fetch the original left `r` unbound, or
            # worse, stale from the previous iteration, inlining the wrong
            # image bytes.  Skip this image instead.
            continue
        text = text.replace(
            src,
            "data:image/%s;base64,%s" % (extension,
                                         base64.b64encode(r.content)))

    # Wrap the cleaned, image-inlined body in a minimal HTML shell.
    # `with` guarantees the file is closed even if a write fails.
    with open(outputfile, 'w') as f:
        f.write(
            '<!DOCTYPE html><html><head><meta charset="UTF-8"><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><title>Document</title></head><body>'
        )
        f.write(text)
        f.write('</body></html>')
# NOTE(review): these two statements reference `no`, `title`, `http` and
# `article`, and the driver below calls `writefile(i, title, article, http)`
# -- they appear to be the body of a `writefile(no, title, article, http)`
# function whose `def` line is missing from this view; confirm against the
# full file.  Also note: `file` shadows the builtin and the handle is never
# closed.
file = open('baidu_result\\' + str(no) + '.txt', 'w')
file.write(title + '\n' + http + '\n' + article)

if __name__ == "__main__":
    httplist = list()
    clearpath()  # empty the output folder
    i = 0
    for httplist_ in gethttp(5000):  # crawl news links from Baidu News
        for http in httplist_:
            print str(i), ': ', http
            article = '1'
            try:
                req = urllib2.Request(http, headers=agent)
                html = urllib2.urlopen(req)
                html = html.read()
                article = Document(html).summary()  # extract article body
                title = Document(html).short_title()  # extract title
                html = str(BeautifulSoup(html, "html.parser"))
                dr = re.compile(r'<[^>]+>')  # regex matching any HTML tag
                article = dr.sub('', article)  # strip HTML tags
                article = article.replace(' ', '')  # remove spaces
                article = article.replace('\n', '')  # remove newlines
            # NOTE(review): catches every Exception and records the page as
            # 'HTTPError' regardless of the actual cause; `e` is unused.
            except Exception, e:
                title = http
                article = 'HTTPError'
            print title  # echo the title to the screen
            writefile(i, title, article, http)  # create the txt and write it
            i += 1