def __extract_url(self, url):
    """Extracts the links in the input URL."""
    import urllib2
    from urllister import URLLister
    from sgmllib import SGMLParseError

    req = urllib2.Request(url, headers={'User-Agent': self.useragent})
    try:
        usock = urllib2.urlopen(req)
        parser = URLLister(url)
        try:
            parser.feed(usock.read())
            parser.close()
        except Exception as exception:
            if self.debug > 0:
                print "sgmllib: Unable to parse web page.\n sgmllib: Raised exception %s" % (type(exception).__name__)
            # Record the failing URL in a per-exception error file
            fd = open(self.outdir + "%s.err" % type(exception).__name__, "a")
            fd.write("%s\n" % (url))
            fd.close()
        usock.close()
        return parser.urls
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as exception:
        if self.debug > 0:
            print "urllib2: Page does not exist or malformed web address.\n urllib2: Raised exception %s" % (type(exception).__name__)
        fd = open(self.outdir + "%s.err" % type(exception).__name__, "a")
        fd.write("%s\n" % (url))
        fd.close()
        return []
def postPingBacks(newbody, post_url):
    """Make the pingback call."""
    pingbackresults = []
    parser = URLLister()
    parser.feed(newbody)
    parser.close()
    urls = parser.urls
    for url in urls:
        url = str(url)
        result = sendPingback(url, post_url)
        pingbackresults.append((url, result))
    return pingbackresults
def hasLinkToTarget(self, sourceURI, targetURI):
    sock = urllib.urlopen(sourceURI)
    html = sock.read()
    sock.close()
    # use Mark Pilgrim's URLLister from Dive Into Python, chapter 8
    parser = URLLister()
    parser.feed(html)
    parser.close()
    links = parser.urls
    if targetURI in links:
        return 1
    else:
        return 0
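# All of the snippets here rely on a URLLister class that is not shown. For
# reference, this is a minimal sketch along the lines of Mark Pilgrim's
# URLLister from Dive Into Python, chapter 8 (cited in the snippet above).
# The variants used elsewhere -- URLLister(url), getUrls()/getImgs() -- are
# assumed to extend this basic idea.
from sgmllib import SGMLParser

class URLLister(SGMLParser):
    """Collect the href target of every <a> tag fed to the parser."""

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # attrs is a list of (name, value) pairs for the <a> tag
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)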
def callback(response):
    if response.error:
        print 'Error', response.error, url
    else:
        data = response.body
        lister = URLLister()
        lister.feed(data)
        urls = lister.getUrls()
        imgs = lister.getImgs()
        self._imgs.extend(imgs)
        for newUrl in urls:
            self._crawl(newUrl, depth - 1)
def _do(self, entry, action):
    """Returns 1 or 0."""
    args = action.split("=")
    if len(args) != 2:
        s = "Not a valid argument for filtering. Check your query."
        self.logger.error(s)
        raise TypeError(s)

    # link_ends
    if args[0] == "link_ends":
        lister = URLLister()
        lister.feed(entry.content)
        lister.close()
        found = 0
        for url in lister.urls:
            if url.endswith(args[1]):
                found = 1
                entry.links = url
        return found

    # whitelist=true | blacklist=true
    elif (args[0] == "whitelist" or args[0] == "blacklist") and args[1] == "true":
        lister = URLLister()
        lister.feed(entry.content)
        lister.close()
        if args[0] == "blacklist":
            found = 1
        else:
            found = 0
        for url in lister.urls:
            l = None
            if args[0] == "whitelist":
                l = settings.LEOLO_WHITELIST
            elif args[0] == "blacklist":
                l = settings.LEOLO_BLACKLIST
            for domain in l:
                # urlparse()[1] extracts domain
                if urlparse(url)[1].endswith(domain):
                    if args[0] == "blacklist":
                        found = 0
                    else:
                        found = 1
                        entry.links = url
                elif not urlparse(url)[1].endswith(domain) and args[0] == "blacklist":
                    entry.links = url
        enclosures = []
        for url in entry.enclosures_cp:
            l = None
            if args[0] == "whitelist":
                l = settings.LEOLO_WHITELIST
            elif args[0] == "blacklist":
                l = settings.LEOLO_BLACKLIST
            for domain in l:
                # urlparse()[1] extracts domain
                if urlparse(url)[1].endswith(domain):
                    if args[0] == "blacklist":
                        found = 0
                    else:
                        found = 1
                        enclosures.append(url)
                elif not urlparse(url)[1].endswith(domain) and args[0] == "blacklist":
                    enclosures.append(url)
        entry.enclosures = enclosures
        return found

    # enclosure_ends
    elif args[0] == "enclosure_ends":
        enclosures = []
        found = 0
        for enc in entry.enclosures_cp:
            if enc.endswith(args[1]):
                found = 1
                enclosures.append(enc)
        entry.enclosures = enclosures
        return found

    # embed_ends
    elif args[0] == "embed_ends":
        soup = BeautifulSoup(entry.content)
        all_params = soup.findAll("param")
        url_pat = re.compile(r"http.[^\"]*" + str(args[1]))
        found = 0
        links = []
        for a in all_params:
            k = url_pat.findall(str(a))
            if k:
                link = k[0]
                if link.startswith("http%3A%2F%2F"):
                    link = link.replace("%3A", ":")
                    link = link.replace("%2F", "/")
                entry.links = link
                found = 1
        return found

    # error
    else:
        s = "'%s' is not a valid argument for filtering. Check your query." % (args[0])
        self.logger.error(s)
        raise TypeError(s)

    return 0
class Users(object):

    class Error(Exception):
        def __init__(self, value):
            self.value = value

        def __str__(self):
            return repr(self.value)

    def __init__(self, limit):
        self.mutex = threading.Lock()
        self.map = dict()
        self.limit = limit
        self.userLister = UserLister()
        self.urlLister = URLLister()
        self.titleFetcher = None
        self.report = True

    def setFetcher(self, titleFetcher):
        self.titleFetcher = titleFetcher

    def users(self):
        try:
            self.mutex.acquire()
            return self.map.keys()
        finally:
            self.mutex.release()

    def disconnectAll(self):
        self.mutex.acquire()
        for connector in self.map.itervalues():
            connector.disconnect()
        self.mutex.release()

    def addUser(self, name, handler):
        self.mutex.acquire()
        try:
            if len(self.map) == self.limit:
                raise Users.Error('maximum connections limit reached')
            if name in self.map or name in ['[SRV]', '[TitleFetcher]']:
                raise Users.Error('chosen name is already used')
            self.map[name] = handler
            if self.report:
                self._messageAll('[SRV]', [name], 'User {0} has joined.\n'.format(name))
        finally:
            self.mutex.release()

    def removeUser(self, name):
        self.mutex.acquire()
        if name in self.map:
            del self.map[name]
            if self.report:
                self._messageAll('[SRV]', [name], 'User {0} has quit.\n'.format(name))
        self.mutex.release()

    def message(self, name, text):
        self.mutex.acquire()
        users = self.userLister.get(text)
        if len(users) > 0:
            self._messageUsers(name, users, text)
            users.append(name)
        else:
            self._messageAll(name, [name], text)
            users = self.map.keys()
        if self.titleFetcher is not None:
            for url in self.urlLister.get(text):
                self.titleFetcher.fetch(url, users)
        self.mutex.release()

    def sendTitle(self, recipients, link):
        self.mutex.acquire()
        msg = '[TitleFetcher]: {0}\n'.format(link)
        log(msg)
        for r in recipients:
            if self._canIgnore(r):
                continue
            self.map[r].write_data(msg)
        self.mutex.release()

    def _messageAll(self, who, exclude, text):
        msg = '{0}: {1}'.format(who, text)
        log(msg)
        for _n, item in self.map.iteritems():
            if _n in exclude:
                continue
            item.write_data(msg)

    def _messageUsers(self, who, recipients, text):
        msg = '{0}: {1}'.format(who, text)
        log(msg)
        for r in recipients:
            if r == who:
                continue
            if self._canIgnore(r):
                continue
            self.map[r].write_data(msg)

    def _canIgnore(self, name):
        # A recipient may be skipped only if no connected user has that name.
        name = name.lower()
        for key in self.map.keys():
            if key.lower() == name:
                return False
        return True
from urllib import urlopen
from urllister import URLLister

s = urlopen('http://igm.univ-mlv.fr/~jyt').read()
p = URLLister()
p.feed(s)
p.close()
print p.urls