class WebSite(object):
    """
    Class representing a website.
    """

    def __init__(self, url, config=None):
        """
        Define a WebSite.
        """
        self.config = config
        self.webData = None
        self.crawl = False
        self.domainBase = None
        self.max_page_count = 1
        if config is not None:
            self.crawl = self.config.has_crawl()
            self.domainBase = self.config.get_domainbase()
            self.max_page_count = self.config.get_max_page_count()
        # Raw string so the \s escapes reach the regex engine unmangled.
        self.get_hrefs = KeeperReg(r"href\s*=\s*[^ <>]*[a-zA-Z0-9]")
        self.toProcess = [url]
        self.max_page_count -= 1
        self.processed = []
        self.iterate_pages()

    def get_next(self):
        """
        @return: The next URL in the list.
        """
        if len(self.toProcess) > 0:
            out = self.toProcess.pop()
            self.processed.append(out)
            return out
        return None

    def get_urls(self, data):
        """
        Process the page looking for URLs to traverse.

        @param data: The raw webpage text.
        """
        if self.get_hrefs is not None and data is not None:
            for url in self.get_hrefs.keep(data):
                # Strip the href markup and quoting down to a bare URL.
                # This could be cleaner.
                url = url.replace("href", "")
                url = url.replace("=", "")
                url = url.replace("\"", "")
                url = url.replace('\\', '')
                url = url.strip()
                parsed = urlparse.urlparse(url)
                url = ("http" if len(parsed.scheme) == 0 else parsed.scheme) + \
                    "://" + parsed.netloc + parsed.path
                if url not in self.toProcess and \
                        url not in self.processed and \
                        self.max_page_count > 0:
                    if self.domainBase is not None:
                        if url.rfind(self.domainBase) != -1 and \
                                url.rfind("@%s" % self.domainBase) == -1:
                            self.toProcess.append(url)
                            self.max_page_count -= 1
                    else:
                        self.toProcess.append(url)
                        self.max_page_count -= 1

    def iterate_pages(self):
        """
        Iterate through the list of URLs available for use.
        """
        url = self.get_next()
        logger.debug("iterating over %s" % url)
        while url is not None:
            logger.debug("toProcess: %s" % self.toProcess)
            logger.debug("Processed: %s" % self.processed)
            logger.debug("processing: %s" % url)
            if isinstance(url, str):
                logger.debug("url is a string")
                self.url = url
                (data, content_type) = self.retrieve_data()
            else:
                self.url = None
                data = ""
                content_type = None
            if self.crawl:
                self.get_urls(data)
            self.process_data(data, content_type)
            url = self.get_next()

    def retrieve_data(self):
        """
        Retrieve the data from the website.

        @return: A (data, content_type) tuple.
        """
        w = WebGrabber(self.url)
        w.get_page()
        if w.done():
            return (w.get_data(), w.get_content_type())
        else:
            logger.warning("Using empty string as WebSite data")
            return ("", None)

    def process_data(self, html, content_type):
        """
        Run the filtering and searching over the retrieved data.
        """
        # TODO: add processing for content types other than text.
        self.process_as_text(html)

    def process_as_text(self, html):
        """
        Process the text data received.
        """
        logger.debug("processing:\n%s..."
                     % (html[:25] or "Nothing"))
        (search, scrub, reverse) = self.generate_filters()
        logger.debug("Search: %s" % search)
        logger.debug("Scrub: %s" % scrub)
        logger.debug("reverse: %s" % reverse)
        if reverse == 0:
            if scrub is not None:
                logger.debug("Scrubbing data")
                html = scrub.scrub(html)
                logger.debug("New size: %d" % len(html))
            if search is not None:
                logger.debug("Searching data")
                logger.debug("Working this %s" % type(html))
                html = search.keep(html)
                logger.debug(html)
        else:
            if search is not None:
                logger.debug("Searching data")
                html = search.keep(html)
            if scrub is not None:
                logger.debug("Scrubbing data")
                html = scrub.scrub(html)
        if not self.crawl or self.webData is None:
            self.webData = html
        elif isinstance(self.webData, str):
            self.webData += "\n\n####\n\n" + html
        elif isinstance(self.webData, list):
            # search.keep() may have produced a list of matches; append the
            # new page's result rather than concatenating strings.
            self.webData.append(html)

    def generate_filters(self):
        """
        Generate the list of filters from the configuration.
        """
        if self.config is None:
            return (None, None, 0)
        if self.config.has_search():
            search = KeeperReg(self.config.search_string)
        else:
            search = None
        if self.config.has_filter():
            scrub = ScrubReg(self.config.filter_string)
        else:
            scrub = None
        return (search, scrub, self.config.get_reverse())

    def get_data(self):
        """
        @return: The data from the web page.
        """
        if self.webData is None:
            return ""
        return self.webData

    def output_file(self, name):
        """
        Generate the output file with the user-specified name.

        @param name: The file name to be used.
        """
        if os.path.exists(name):
            count = 1
            tmp = "%s_%d" % (name, count)
            while os.path.exists(tmp):
                count += 1
                tmp = "%s_%d" % (name, count)
            logger.warning(("unable to use '%s' as an output file; " +
                            "using '%s' instead.") % (name, tmp))
            name = tmp
        output = open(name, "w")
        if isinstance(self.get_data(), list):
            for l in self.get_data():
                output.write("%s\n" % l)
        else:
            output.write(str(self.get_data()))
        output.close()
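

# ---------------------------------------------------------------------------
# Minimal usage sketch (hypothetical; not part of the original module).
# It assumes WebGrabber, KeeperReg and ScrubReg are importable as in the rest
# of this file, that network access is available, and that config=None
# (single page, no crawling) is acceptable.  The URL and output file name are
# placeholder values.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import logging
    logging.basicConfig(level=logging.DEBUG)

    # With config=None the constructor defaults to crawl=False and
    # max_page_count=1, so only the given URL is fetched and processed.
    site = WebSite("http://example.com")

    # get_data() returns "" when nothing was retrieved; otherwise a string,
    # or a list of matches when a search filter was configured.
    print(site.get_data()[:200])

    # output_file() appends "_<n>" to the name instead of overwriting an
    # existing file.
    site.output_file("example_output.txt")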