import re
from urlparse import urlsplit

import lxml.html as lh

# GetHTMLPage and MIMEHandler are helper classes defined elsewhere in this
# project; import them from the modules where the project keeps them.


class GetHTMLAndParse:

    def __init__(self):
        self.crawler = GetHTMLPage()
        self.crawler.set_headers((
            ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ))
        self.mime_handler = MIMEHandler()
        # define wanted/unwanted file types
        self.wanted_mimes = [
            'application/pdf',
            'application/msword',
            'text/rtf',
            'application/postscript',
            'octet/stream',
            'application/vnd.oasis.opendocument.text',
        ]
        self.unwanted_mimes = [
            'application/zip',
            'application/x-tar',
            'application/x-gtar',
        ]

    def ghap(self, url):
        """Get HTML And Parse the page identified by url and remember its tree."""
        self._current_tree = None
        # open URL
        try:
            _res = self.crawler.get_page(url)
            if _res[0] == -1:
                self._current_tree = -1
                return (-1, _res[1])
            else:
                self._current_tree = self.crawler.get_etree()
        except:
            return (-1, 'Downloading page interrupted.')
        # successful return
        return (1, 'OK')

    def is_wanted_mime(self, link):
        """Test whether the MIME type of link is a wanted deliverable document type."""
        res = self.get_content_type(link)
        if not res:
            return False
        return res in self.wanted_mimes

    def is_unwanted_mime(self, link):
        """Test whether the MIME type of link is an explicitly unwanted type."""
        res = self.get_content_type(link)
        if not res:
            return True
        return res in self.unwanted_mimes

    def is_page(self, link):
        """Test whether link points to an HTML page."""
        return self.get_content_type(link) == 'text/html'

    def get_content_type(self, url=None):
        """Return the MIME type of the content at url, or False on failure."""
        if url is None:
            return False
        res = self.mime_handler.start([url])
        if res is None:
            print "Error while detecting the MIME type"
            return False
        else:
            return res[url]

    def compare_domains(self, right, left):
        """Compare the domain names of two URLs."""
        rsplit = urlsplit(right)
        lsplit = urlsplit(left)
        # now we have two tuples of parsed URLs
        if re.match("(wiki\.|www\.)?" + rsplit[1], lsplit[1], re.I):
            return 1
        else:
            return 0

    def get_domain_name(self, url):
        """Simply get the domain name from a URL."""
        try:
            # use the urlsplit function; index 1 is the network location
            return urlsplit(url)[1]
        except:
            return None

    def get_all_links(self, regul=None, base=None):
        """Get, filter and edit anchors and return their URLs.

        If regul is not None, return URLs only from anchors whose text or
        attributes match the regexp regul. If base is not None, make the
        URLs absolute by joining base with the anchor's href attribute.
        """
        # get all anchors
        links = self._current_tree.findall('.//a[@href]')
        final = []
        for link in links:
            # all attributes and text together
            try:
                texts = link.text_content() + " " + " ".join(link.values())
            except:
                return list()
            # make links absolute
            if base is not None:
                link.make_links_absolute(base)
            # search in links
            if regul is not None:
                if regul.search(texts):
                    # regul matches
                    final.append(link.get('href'))  # get URL
            else:
                final.append(link.get('href'))
        return list(set(final))  # my little uniq

    def get_pager_links(self, base=None):
        """Helper method for finding pager links ("1 2 3 ... next")."""
        # get all anchors with an href attribute
        links = self._current_tree.findall('.//a[@href]')
        final = []
        for link in links:
            text = lh.tostring(link, method='text', encoding=unicode)
            if base is not None:
                # make links absolute
                link.make_links_absolute(base)
            # search for the pager pattern
            if re.search('(^ ?[0-9]+ ?$)|(next)', text, re.I):
                final.append(link.get('href'))  # get URL
        return list(set(final))  # my little uniq

    def count_all_headers(self, regul=None):
        """Get, filter and count header titles on one page."""
        # look at only the first 3 levels of headers
        try:
            heads = self._current_tree.findall('//h1')
            heads.extend(self._current_tree.findall('//h2'))
            heads.extend(self._current_tree.findall('//h3'))
        except AssertionError:
            return 0
        final = []
        # search in headers
        if regul is not None:
            for head in heads:
                try:
                    if regul.search(head.text_content()):
                        # regul matches
                        final.append(head.text_content())  # keep the matching header
                except UnicodeDecodeError:
                    return (-1, 'Unicode decode error.')
        else:
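# Minimal usage sketch (kept as a comment, illustrative only): the URL and
# the regular expression below are made-up examples, and the sketch assumes
# GetHTMLPage and MIMEHandler behave exactly as they are used in the class
# above.
#
#   ghap = GetHTMLAndParse()
#   status, message = ghap.ghap('http://www.example.org/project/deliverables')
#   if status == 1:
#       links = ghap.get_all_links(regul=re.compile('deliverable', re.I),
#                                  base='http://www.example.org/')
#       documents = [url for url in links if ghap.is_wanted_mime(url)]
#       pages = [url for url in links if ghap.is_page(url)]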