Beispiel #1
0
#! /usr/bin/python2 
# -*- encoding: utf-8 -*-
# On Unix, to run this as a CGI script, mark the file executable with chmod +x and give the interpreter path in a shebang line such as "#! /usr/bin/python2".
# None of this is needed on Windows.
# python 2.4.3
# Beautiful Soup (2.1.1)
import urllib
from BeautifulSoup import BeautifulSoup
#from bs4 import BeautifulSoup

# Download the Naver front page.  The handle is closed in a ``finally`` so the
# underlying socket is released even if read() raises.
response = urllib.urlopen('http://www.naver.com')
try:
    html_source = response.read()
finally:
    response.close()

# Parse the markup, telling BeautifulSoup the document is UTF-8 encoded.
soup = BeautifulSoup(html_source, fromEncoding="utf-8")

# Print the page text exactly once.  The previous version printed it inside a
# loop over every <a> tag without using the loop variable, so the same full
# text was emitted once per link (and not at all for a page without links).
#
# getText() is used instead of get_text(): with the BeautifulSoup 3 import
# used by this script, get_text is not a method (unknown attributes are
# treated as a tag lookup and yield None, so calling it fails), whereas
# getText() is also kept as an alias in bs4.
# NOTE(review): getText() needs BeautifulSoup 3.2 or newer -- confirm the
# installed version.
#
# To list the link targets instead of the text:
#     for link in soup.findAll('a'):
#         print(link.get('href'))
print(soup.getText())
Beispiel #2
0
	################## Pre-Processing Of Text ###############################

	# data = soup.findAll(text=True)
	# [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
	# visible_text = soup.getText()

	
	
	
		# kill all script and style elements
	for script in soup(["script", "style","title","head","[document]"]):
	    script.extract()    # rip it out

	# get text
	text = soup.get_text()

	# break into lines and remove leading and trailing space on each
	lines = (line.strip() for line in text.splitlines())
	# break multi-headlines into a line each
	chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
	# drop blank lines
	text = '\n'.join(chunk for chunk in chunks if chunk)

	visible_text=(text.encode('utf-8'))

	FewText=visible_text[2500:3000]


	for  words in SearchWords :