def __extract_url(self, url): 
		"""Fetch *url* and return the list of links found in its HTML.

		The page is downloaded with urllib2 (Python 2) using
		``self.useragent`` as the User-Agent header, then fed to
		``URLLister``; its collected ``urls`` list is returned.  On any
		fetch failure an empty list is returned instead.  Whenever
		``self.debug > 0``, a failing URL is appended to the file
		``self.outdir + "<ExceptionName>.err"`` for later inspection.
		"""
		
		# Function-scope imports: urllib2/sgmllib are Python-2-only modules.
		import urllib2
		from urllister import URLLister
		from sgmllib import SGMLParseError
		
		# Custom User-Agent: some servers reject the default urllib2 agent.
		req = urllib2.Request(url, headers={'User-Agent' : self.useragent}) 
		try:
			usock = urllib2.urlopen(req)
			parser = URLLister(url)
		
			try:
				# Parsing errors are non-fatal: log them (in debug mode)
				# and fall through so partially-parsed links are returned.
				parser.feed(usock.read())
				parser.close()
			except Exception as exception:
				if (self.debug > 0): 
					print "sgmllib: Unable to parse web page.\n sgmllib: Raised exception %s"%(type(exception).__name__)
					# One .err file per exception type, appended to over time.
					fd = open(self.outdir+"%s.err"%type(exception).__name__, "a")
					fd.write( "%s\n"%(url))	
					fd.close()
				pass
			usock.close()
			return parser.urls
		except (KeyboardInterrupt, SystemExit):
			# Never swallow user interrupts / interpreter shutdown.
			raise
		except Exception as exception:
			# Network / URL errors: best-effort log, then report "no links".
			if (self.debug > 0): 
				print "urllib2: Page does not exist or Malformed web address.\n sgmllib: Raised exception %s"%(type(exception).__name__) 
				fd = open(self.outdir+"%s.err"%type(exception).__name__, "a")
				fd.write( "%s\n"%(url))	
				fd.close()
			return []
# Example #2 (score: 0)
def postPingBacks(newbody, post_url):
    """Send a pingback for every link found in *newbody*.

    The HTML body is scanned with URLLister; each discovered URL is
    pinged via sendPingback().  Returns a list of (url, result) pairs,
    one per link, in discovery order.
    """
    link_parser = URLLister()
    link_parser.feed(newbody)
    link_parser.close()

    results = []
    for raw_url in link_parser.urls:
        target = str(raw_url)
        results.append((target, sendPingback(target, post_url)))
    return results
    def hasLinkToTarget(self, sourceURI, targetURI):
        """Return 1 if the page at *sourceURI* links to *targetURI*, else 0.

        Downloads *sourceURI* with urllib.urlopen (Python 2), extracts
        its links and tests membership of *targetURI* among them.
        """
        sock = urllib.urlopen(sourceURI)
        try:
            html = sock.read()
        finally:
            # Fix: always release the connection -- the original leaked
            # the socket when read() raised.
            sock.close()
        # use Mark Pilgrim's URLLister from dive into python, chapter 8
        parser = URLLister()
        parser.feed(html)
        parser.close()

        # Keep the original 1/0 return convention for existing callers.
        if targetURI in parser.urls:
            return 1
        return 0
# Example #4 (score: 0)
    def _do(self, entry, action):
        """Apply one filter *action* (of the form ``"name=value"``) to *entry*.

        Supported names: ``link_ends``, ``whitelist``/``blacklist``
        (value must be ``true``), ``enclosure_ends``, ``embed_ends``.
        Side effects: may set ``entry.links`` and ``entry.enclosures``.
        Raises TypeError for any malformed or unknown action.

        Returns 1 or 0.
        """
        # Split "name=value"; anything but exactly one '=' is rejected.
        args = action.split("=")
        if len(args) != 2:
            s = "Not a valid argument for filtering. Check your query."
            self.logger.error(s)
            raise TypeError(s)

        # link_ends=<suffix>: match if any link in the content ends with <suffix>.
        if args[0] == "link_ends":
            lister = URLLister()
            lister.feed(entry.content)
            lister.close()
            found = 0
            for url in lister.urls:
                if url.endswith(args[1]):
                    found = 1
                    # NOTE(review): overwritten on every match, so
                    # entry.links holds only the LAST matching URL.
                    entry.links = url
            return found

        # whitelist=true | blacklist=true
        elif (args[0] == "whitelist" or args[0] == "blacklist") and args[1] == "true":
            lister = URLLister()
            lister.feed(entry.content)
            lister.close()
            # blacklist starts "accepted" and is vetoed by a match;
            # whitelist starts "rejected" and needs a match to pass.
            if args[0] == "blacklist":
                found = 1
            else:
                found = 0
            # First pass: links extracted from the entry body.
            for url in lister.urls:
                l = None
                # presumably iterables of domain suffixes from Django-style
                # settings -- TODO confirm against settings module.
                if args[0] == "whitelist":
                    l = settings.LEOLO_WHITELIST
                elif args[0] == "blacklist":
                    l = settings.LEOLO_BLACKLIST
                for domain in l:
                    # urlparse()[1] extracts domain
                    if urlparse(url)[1].endswith(domain):
                        if args[0] == "blacklist":
                            found = 0
                        else:
                            found = 1
                            entry.links = url
                    elif not urlparse(url)[1].endswith(domain) and args[0] == "blacklist":
                        entry.links = url

            # Second pass: same domain test applied to the enclosure URLs;
            # surviving enclosures are collected and written back.
            enclosures = []
            for url in entry.enclosures_cp:
                l = None
                if args[0] == "whitelist":
                    l = settings.LEOLO_WHITELIST
                elif args[0] == "blacklist":
                    l = settings.LEOLO_BLACKLIST
                for domain in l:
                    # urlparse()[1] extracts domain
                    if urlparse(url)[1].endswith(domain):
                        if args[0] == "blacklist":
                            found = 0
                        else:
                            found = 1
                            enclosures.append(url)
                    elif not urlparse(url)[1].endswith(domain) and args[0] == "blacklist":
                        enclosures.append(url)
            entry.enclosures = enclosures

            return found

        # enclosure_ends=<suffix>: keep only enclosures ending with <suffix>.
        elif args[0] == "enclosure_ends":
            enclosures = []
            found = 0
            for enc in entry.enclosures_cp:
                if enc.endswith(args[1]):
                    found = 1
                    enclosures.append(enc)
            entry.enclosures = enclosures
            return found

        # embed_ends=<suffix>: scan <param> tags for embedded media URLs.
        elif args[0] == "embed_ends":
            soup = BeautifulSoup(entry.content)
            all_params = soup.findAll("param")
            # NOTE(review): args[1] is interpolated into the pattern
            # without re.escape(), so regex metacharacters in the suffix
            # change the match semantics -- confirm intended.
            url_pat = re.compile(r"http.[^\"]*" + str(args[1]))
            found = 0
            # NOTE(review): `links` is never appended to or read below.
            links = []
            for a in all_params:
                k = url_pat.findall(str(a))
                if k:
                    link = k[0]
                    # Undo common URL-encoding of "http://" prefixes.
                    if link.startswith("http%3A%2F%2F"):
                        link = link.replace("%3A", ":")
                        link = link.replace("%2F", "/")
                    entry.links = link
                    found = 1
            return found

        # error: unknown filter name.
        else:
            s = "'%s' is not a valid argument for filtering. Check " "your query." % (args[0])
            self.logger.error(s)
            raise TypeError(s)

        # NOTE(review): unreachable -- every branch above returns or raises.
        return 0
# Example #5 (score: 0)
from urllib import urlopen
from urllister import URLLister
s = urlopen('http://igm.univ-mlv.fr/~jyt').read()
p = URLLister()
p.feed(s)
p.close()
print p.urls