Example #1
0
 def getContentNormalisedURLList(self):
   """
   Return the normalised content URLs that belong to this URL's domain.

   Every URL returned by getContentURLList is passed through normaliseUrl,
   with getContentBaseURL() given as base_url. A URL is kept when its
   network location is empty (relative link) or when the last two
   dot-separated labels of its network location match those of the
   normalised self.asURL().

   URLs whose normalisation raises UnicodeDecodeError, or that normalise
   to an empty value, are skipped.
   """
   def _domain_key(netloc):
     # in www.example.com or www.3.example.com
     # keep only the example.com part.
     # The labels are re-joined with '.' so that distinct domains such as
     # 'a.bc' and 'ab.c' cannot collapse to the same key.
     # NOTE(review): the network location is used as-is, so a port or
     # user info present in it takes part in the comparison.
     key = '.'.join(netloc.split('.')[-2:])
     if isinstance(key, unicode):
       # compare byte strings only, whatever normaliseUrl returned
       key = key.encode('utf-8')
     return key

   reference_domain = _domain_key(
     urlsplit(normaliseUrl(self.asURL() or ''))[1])
   url_list = []
   base_url = self.getContentBaseURL()
   for url in self.getContentURLList():
     try:
       url = normaliseUrl(url, base_url=base_url)
     except UnicodeDecodeError:
       # Ignore wrong encoding errors
       # Web is not a kind world
       continue
     if not url:
       continue
     url_domain = urlsplit(url)[1]
     # if domain is empty (relative link) or domain is same, then OK
     if not url_domain or _domain_key(url_domain) == reference_domain:
       url_list.append(url)
   return url_list
Example #2
0
 def getContentNormalisedURLList(self):
     """
     Return the normalised content URLs that belong to this URL's domain.

     Every URL returned by getContentURLList is passed through
     normaliseUrl, with getContentBaseURL() given as base_url. A URL is
     kept when its network location is empty (relative link) or when the
     last two dot-separated labels of its network location match those of
     the normalised self.asURL().

     URLs whose normalisation raises UnicodeDecodeError, or that
     normalise to an empty value, are skipped.
     """
     def _domain_key(netloc):
         # in www.example.com or www.3.example.com
         # keep only the example.com part.
         # The labels are re-joined with '.' so that distinct domains such
         # as 'a.bc' and 'ab.c' cannot collapse to the same key.
         # NOTE(review): the network location is used as-is, so a port or
         # user info present in it takes part in the comparison.
         key = '.'.join(netloc.split('.')[-2:])
         if isinstance(key, unicode):
             # compare byte strings only, whatever normaliseUrl returned
             key = key.encode('utf-8')
         return key

     reference_domain = _domain_key(
         urlsplit(normaliseUrl(self.asURL() or ''))[1])
     url_list = []
     base_url = self.getContentBaseURL()
     for url in self.getContentURLList():
         try:
             url = normaliseUrl(url, base_url=base_url)
         except UnicodeDecodeError:
             # Ignore wrong encoding errors
             # Web is not a kind world
             continue
         if not url:
             continue
         url_domain = urlsplit(url)[1]
         # if domain is empty (relative link) or domain is same, then OK
         if not url_domain or _domain_key(url_domain) == reference_domain:
             url_list.append(url)
     return url_list
Example #3
0
File: url.py Project: MarkTang/erp5
 def asNormalisedURL(self, base_url=None):
   """
   Return the result of normaliseUrl applied to self.asURL().

   base_url is forwarded unchanged to normaliseUrl.
   Returns None when hasUrlString() is false.
   """
   if not self.hasUrlString():
     # nothing to normalise
     return None
   raw_url = self.asURL()
   return normaliseUrl(raw_url, base_url=base_url)
Example #4
0
 def asNormalisedURL(self, base_url=None):
     """
     Return the result of normaliseUrl applied to self.asURL().

     base_url is forwarded unchanged to normaliseUrl.
     Returns None when hasUrlString() is false.
     """
     if not self.hasUrlString():
         # nothing to normalise
         return None
     raw_url = self.asURL()
     return normaliseUrl(raw_url, base_url=base_url)