Example #1
    def __init__(self, urlstr, debug=0):
        """
		get HTML contents at a given url 'urlstr'
		"""
        self.debug = debug
        self.geturl = geturl.geturl(urlstr)
        if debug:
            print '### DATA size:', len(self.geturl.data)

        self.parser = hparser.hparser(self.geturl.baseurl, debug=1)
        self.parser.feed(self.geturl.data)
        self.parser.close()
        if debug:
            print '### Got :', len(self.parser.data)
            print self.parser.data
        self.parser.analyze()
        print '#' * 50, '\n'
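
geturl and hparser are project-local modules that are not shown in these snippets, and the examples themselves are Python 2. As a rough, self-contained sketch of the same fetch-and-parse flow on Python 3 using only the standard library (the class and function names here are illustrative, not the example's own API):

# Minimal sketch, Python 3 standard library only; not the example's API.
import urllib.request
from html.parser import HTMLParser

class AnchorParser(HTMLParser):
    """Collect the href of every <a> tag fed to the parser."""
    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchors.append(value)

def fetch_and_parse(urlstr, debug=False):
    """Fetch the HTML at urlstr and return the anchors it contains."""
    with urllib.request.urlopen(urlstr) as handle:
        data = handle.read().decode('utf-8', errors='replace')
    if debug:
        print('### DATA size:', len(data))
    parser = AnchorParser()
    parser.feed(data)
    parser.close()
    if debug:
        print('### Got :', len(parser.anchors), 'anchors')
    return parser.anchors

Calling fetch_and_parse('http://example.com', debug=True) prints the data size and returns the list of anchor URLs, mirroring what the example's debug branch reports.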
Example #2
	theurl = 'http://m.knpu.org/hoho'

	txdata = None					# request body (None means a plain GET)
	txheaders = client.header.header_table		# request headers

	try:
		# fetch the start URL, then search it for child URLs
		req = urllib2.Request(theurl, txdata, txheaders)
		handle = urllib2.urlopen(req)
		
		data = handle.read()
		print data

		# extract the URLs from the fetched page
		parser = hparser.hparser(theurl, debug=1)
		parser.feed(data)
		parser.close()
		lists = parser.get_anchors()
		# map each URL to the depth level at which it was found
		visited_list = []
		url_table = {}
		depth = 2
		seq_num = 1
		prefix = 'knpu.org'
		prefix = 'prefixtest.com'	# overrides the previous assignment

		for homepage in lists:
			url_table[homepage[0]] = 1	# first depth level
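
The url_table dictionary built in the last lines is the crawl frontier: each discovered URL is keyed to the depth at which it was found, and the depth and prefix variables bound how far the crawl goes. A minimal sketch of how such a URL-to-depth table can drive a bounded crawl, assuming a fetch_and_parse helper like the one sketched after Example #1:

# Sketch of the URL -> depth bookkeeping; fetch_and_parse is assumed
# to be a function like the one sketched after Example #1.
def crawl(start_url, max_depth=2, prefix=''):
    """Visit pages breadth-style, keeping a URL -> depth table as the frontier."""
    url_table = {start_url: 1}      # first depth level, as in the example
    visited_list = []
    while url_table:
        url, current_depth = url_table.popitem()
        if url in visited_list or current_depth > max_depth:
            continue
        visited_list.append(url)
        for child in fetch_and_parse(url):
            if prefix and prefix not in child:
                continue            # stay within the target site
            if child not in visited_list and child not in url_table:
                url_table[child] = current_depth + 1   # one level deeper
    return visited_list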
Example #3
					# sum the header sizes: name length plus value length
					for i in req.unredirected_hdrs:
						length = length + len(i) + len(req.unredirected_hdrs[i])

					data = handle_output.read()

					if handle_output.headers.getheader('Content-Type') != \
						'text/html':	# skip responses that are not text/html
						continue

					try:
						parser = hparser.hparser(handle_output.geturl(), debug=1)
						parser.feed(data)
						parser.close()
						lists = parser.get_anchors()	# store the URLs found in the body
					except Exception:
						continue	# skip pages that fail to parse, rather than reusing stale anchors


					for homepage in lists:
						# follow only http links
						if urlparse.urlparse(homepage[0])[0] != 'http':
							print urlparse.urlparse(homepage[0])[0]
							continue
						url_table[homepage[0]] = current_depth + 1
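
For reference, the same Content-Type and scheme filtering on Python 3, where response headers are an email.message.Message rather than the old getheader interface (the function and variable names are assumptions, not the example's own):

# Sketch of the filtering steps in Python 3; handle_output, anchors,
# url_table, and current_depth are assumed to come from a crawl loop
# like the one above.
from urllib.parse import urlparse

def record_children(handle_output, anchors, url_table, current_depth):
    """Record each http child URL one depth level below its parent page."""
    # skip responses that are not HTML
    if handle_output.headers.get_content_type() != 'text/html':
        return
    for href in anchors:
        if urlparse(href).scheme != 'http':    # follow only http links
            continue
        url_table[href] = current_depth + 1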