def __extract_url(self, url): """Extracts the links in the input URL """ import urllib2 from urllister import URLLister from sgmllib import SGMLParseError req = urllib2.Request(url, headers={'User-Agent' : self.useragent}) try: usock = urllib2.urlopen(req) parser = URLLister(url) try: parser.feed(usock.read()) parser.close() except Exception as exception: if (self.debug > 0): print "sgmllib: Unable to parse web page.\n sgmllib: Raised exception %s"%(type(exception).__name__) fd = open(self.outdir+"%s.err"%type(exception).__name__, "a") fd.write( "%s\n"%(url)) fd.close() pass usock.close() return parser.urls except (KeyboardInterrupt, SystemExit): raise except Exception as exception: if (self.debug > 0): print "urllib2: Page does not exist or Malformed web address.\n sgmllib: Raised exception %s"%(type(exception).__name__) fd = open(self.outdir+"%s.err"%type(exception).__name__, "a") fd.write( "%s\n"%(url)) fd.close() return []
def postPingBacks(newbody, post_url):
    """Send a pingback for every link discovered in *newbody*.

    Returns a list of (url, result) tuples, one per link, where
    result is whatever sendPingback() reported for that url.
    """
    lister = URLLister()
    lister.feed(newbody)
    lister.close()
    # Normalize each discovered link to a plain str, then ping it.
    links = [str(u) for u in lister.urls]
    return [(u, sendPingback(u, post_url)) for u in links]
def hasLinkToTarget(self, sourceURI, targetURI):
    """Return 1 if the page at *sourceURI* links to *targetURI*, else 0.

    Fetches sourceURI and extracts its anchors with Mark Pilgrim's
    URLLister (Dive Into Python, chapter 8), then tests membership.
    """
    sock = urllib.urlopen(sourceURI)
    try:
        html = sock.read()
    finally:
        # Release the connection even if read() raises
        # (the original leaked the socket on a read error).
        sock.close()
    parser = URLLister()
    parser.feed(html)
    parser.close()
    # Keep the historical int 1/0 return for callers comparing to ints.
    if targetURI in parser.urls:
        return 1
    return 0
def _do(self, entry, action):
    """ Returns 1 or 0. """
    # *action* is a single "key=value" filter expression; dispatch on
    # the key and report 1 (matched) or 0 (not matched).  Matching
    # branches also mutate *entry* (entry.links / entry.enclosures).
    args = action.split("=")
    if len(args) != 2:
        s = "Not a valid argument for filtering. Check your query."
        self.logger.error(s)
        raise TypeError(s)
    # link_ends: match when any link in the entry body ends with the
    # given suffix; the last matching url is stored on entry.links.
    if args[0] == "link_ends":
        lister = URLLister()
        lister.feed(entry.content)
        lister.close()
        found = 0
        for url in lister.urls:
            if url.endswith(args[1]):
                found = 1
                entry.links = url
        return found
    # whitelist=true | blacklist=true: test every link/enclosure domain
    # against settings.LEOLO_WHITELIST / settings.LEOLO_BLACKLIST.
    # NOTE(review): for blacklist, found starts at 1 and is flipped to 0
    # on a domain hit; with several urls/domains the flag reflects the
    # LAST comparison, not "any hit" — confirm this is intended.
    elif (args[0] == "whitelist" or args[0] == "blacklist") and args[1] == "true":
        lister = URLLister()
        lister.feed(entry.content)
        lister.close()
        if args[0] == "blacklist":
            found = 1
        else:
            found = 0
        for url in lister.urls:
            l = None
            if args[0] == "whitelist":
                l = settings.LEOLO_WHITELIST
            elif args[0] == "blacklist":
                l = settings.LEOLO_BLACKLIST
            for domain in l:
                # urlparse()[1] extracts domain
                if urlparse(url)[1].endswith(domain):
                    if args[0] == "blacklist":
                        found = 0
                    else:
                        found = 1
                        entry.links = url
                elif not urlparse(url)[1].endswith(domain) and args[0] == "blacklist":
                    entry.links = url
        # Same domain test repeated for the entry's enclosures; the
        # surviving enclosures replace entry.enclosures.
        enclosures = []
        for url in entry.enclosures_cp:
            l = None
            if args[0] == "whitelist":
                l = settings.LEOLO_WHITELIST
            elif args[0] == "blacklist":
                l = settings.LEOLO_BLACKLIST
            for domain in l:
                # urlparse()[1] extracts domain
                if urlparse(url)[1].endswith(domain):
                    if args[0] == "blacklist":
                        found = 0
                    else:
                        found = 1
                        enclosures.append(url)
                elif not urlparse(url)[1].endswith(domain) and args[0] == "blacklist":
                    enclosures.append(url)
        entry.enclosures = enclosures
        return found
    # enclosure_ends: keep only enclosures ending with the suffix.
    elif args[0] == "enclosure_ends":
        enclosures = []
        found = 0
        for enc in entry.enclosures_cp:
            if enc.endswith(args[1]):
                found = 1
                enclosures.append(enc)
        entry.enclosures = enclosures
        return found
    # embed_ends: scan <param> tags of embedded objects for an http url
    # ending with the suffix; percent-encoded "http%3A%2F%2F" prefixes
    # are decoded before storing on entry.links.
    elif args[0] == "embed_ends":
        soup = BeautifulSoup(entry.content)
        all_params = soup.findAll("param")
        url_pat = re.compile(r"http.[^\"]*" + str(args[1]))
        found = 0
        # NOTE(review): 'links' is assigned here but never used.
        links = []
        for a in all_params:
            k = url_pat.findall(str(a))
            if k:
                link = k[0]
                if link.startswith("http%3A%2F%2F"):
                    # Undo URL-escaping of "://" and "/" separators.
                    link = link.replace("%3A", ":")
                    link = link.replace("%2F", "/")
                # NOTE(review): reconstructed from a collapsed one-line
                # source — assumed entry.links/found are set for every
                # regex hit, not only escaped ones; confirm nesting.
                entry.links = link
                found = 1
        return found
    # error: unknown filter key.
    else:
        s = "'%s' is not a valid argument for filtering. Check " "your query." % (args[0])
        self.logger.error(s)
        raise TypeError(s)
    # NOTE(review): unreachable — every branch above returns or raises.
    return 0
from urllib import urlopen from urllister import URLLister s = urlopen('http://igm.univ-mlv.fr/~jyt').read() p = URLLister() p.feed(s) p.close() print p.urls