def __extract_url(self, url): 
	"""Extract and return all links found in the web page at *url*.

	Fetches the page with a custom User-Agent (self.useragent) and feeds
	the HTML to URLLister.  Returns the list of collected URLs, or []
	when the page cannot be fetched at all.  When self.debug > 0,
	failures are printed and the offending URL is appended to a
	"<ExceptionName>.err" file under self.outdir.
	"""
	import urllib2
	from urllister import URLLister

	def _log_failure(message, exception):
		# Best-effort debug logging: print a diagnostic and record the
		# failing URL in an error file named after the exception type.
		if self.debug > 0:
			name = type(exception).__name__
			print(message % name)
			with open(self.outdir + "%s.err" % name, "a") as fd:
				fd.write("%s\n" % url)

	req = urllib2.Request(url, headers={'User-Agent': self.useragent})
	try:
		usock = urllib2.urlopen(req)
		try:
			parser = URLLister(url)
			try:
				parser.feed(usock.read())
				parser.close()
			except Exception as exception:
				# Parse failures are non-fatal: return whatever URLs
				# were collected before the parser choked.
				_log_failure("sgmllib: Unable to parse web page.\n sgmllib: Raised exception %s", exception)
		finally:
			usock.close()  # always release the socket, even on parse errors
		return parser.urls
	except (KeyboardInterrupt, SystemExit):
		raise  # never swallow shutdown requests
	except Exception as exception:
		_log_failure("urllib2: Page does not exist or Malformed web address.\n urllib2: Raised exception %s", exception)
		return []
# ---- Beispiel #2 (Example #2) ----
def postPingBacks(newbody, post_url):
    """Send a pingback for every link discovered in *newbody*.

    Returns a list of (url, result) tuples, one per discovered link,
    where result is whatever sendPingback returned for that URL.
    """
    lister = URLLister()
    lister.feed(newbody)
    lister.close()

    results = []
    for link in lister.urls:
        link = str(link)
        results.append((link, sendPingback(link, post_url)))
    return results
    def hasLinkToTarget(self, sourceURI, targetURI):
        """Return 1 when the page at *sourceURI* links to *targetURI*, else 0.

        Downloads sourceURI, extracts its links with Mark Pilgrim's
        URLLister (Dive Into Python, chapter 8), and tests membership.
        Network or parse errors propagate to the caller.
        """
        sock = urllib.urlopen(sourceURI)
        try:
            html = sock.read()
        finally:
            sock.close()  # don't leak the socket if read() raises

        parser = URLLister()
        parser.feed(html)
        parser.close()

        # Keep the historical int return values (1/0) for callers that
        # may compare against them explicitly.
        return 1 if targetURI in parser.urls else 0