Beispiel #1
0
 def __get_anchors(self):
     """Return the page's anchors, resolved against ``self.context``.

     The result is computed on first use and stored on the instance, so
     later calls return the same list (changes made to ``raw_anchors``
     after the first call are not reflected).

     Resolution rules, applied to each entry of ``self.raw_anchors``:

     * if ``self.context`` is falsy, ``raw_anchors`` is returned as-is;
     * anchors starting with ``http://`` or ``https://`` are kept unchanged;
     * anchors containing ``../`` are skipped (not supported yet);
     * anchors starting with ``/`` are appended to the context's scheme
       and authority;
     * any other anchor is resolved against the directory of the
       context's path.

     NOTE(review): the port returned by ``normalize_url`` is not used when
     rebuilding URLs, and ``uri_scheme`` is presumably returned with its
     ``://`` separator included -- confirm against ``normalize_url``.
     """
     try:
         # Read the attribute itself instead of hasattr(self, '__anchors'):
         # inside a class body ``self.__anchors`` is name-mangled while the
         # string literal is not, so the hasattr check could never hit.
         return self.__anchors
     except AttributeError:
         pass

     if not self.context:
         self.__anchors = self.raw_anchors
         return self.__anchors

     resolved = []
     base = None  # (uri_scheme, authority, path) of the context, looked up lazily
     for anchor in self.raw_anchors:
         if anchor.startswith(('http://', 'https://')):
             resolved.append(anchor)
             continue
         if '../' in anchor:
             # TODO Process relative anchor and continue
             continue

         if base is None:
             # Only parse the context once, and only when an anchor needs it.
             uri_scheme, authority, _port, path = normalize_url(self.context)
             base = (uri_scheme, authority, path)
         uri_scheme, authority, path = base

         if anchor.startswith('/'):
             resolved.append(
                 "{0}{1}{2}".format(uri_scheme, authority, anchor))
             continue

         # Drop the last path segment (the document name) to get the
         # directory the relative anchor lives in.
         pieces = path[1:].split('/')
         pieces.pop()
         # A single remaining directory must be kept too; previously a
         # dangling ``else`` reset the path to '' in that case.
         directory = '/'.join(pieces) + '/' if pieces else ''
         resolved.append("{0}{1}/{2}{3}".format(
             uri_scheme, authority, directory, anchor))

     # Store only a fully built list so a failure midway is never cached.
     self.__anchors = resolved
     return self.__anchors
 def __get_anchors(self):
     """Return the page's anchors, resolved against ``self.context``.

     The result is computed on first use and stored on the instance, so
     later calls return the same list (changes made to ``raw_anchors``
     after the first call are not reflected).

     Resolution rules, applied to each entry of ``self.raw_anchors``:

     * if ``self.context`` is falsy, ``raw_anchors`` is returned as-is;
     * anchors starting with ``http://`` or ``https://`` are kept unchanged;
     * anchors containing ``../`` are skipped (not supported yet);
     * anchors starting with ``/`` are appended to the context's scheme
       and authority;
     * any other anchor is resolved against the directory of the
       context's path.

     NOTE(review): the port returned by ``normalize_url`` is not used when
     rebuilding URLs, and ``uri_scheme`` is presumably returned with its
     ``://`` separator included -- confirm against ``normalize_url``.
     """
     try:
         # Read the attribute itself instead of hasattr(self, '__anchors'):
         # inside a class body ``self.__anchors`` is name-mangled while the
         # string literal is not, so the hasattr check could never hit.
         return self.__anchors
     except AttributeError:
         pass

     if not self.context:
         self.__anchors = self.raw_anchors
         return self.__anchors

     resolved = []
     base = None  # (uri_scheme, authority, path) of the context, looked up lazily
     for anchor in self.raw_anchors:
         if anchor.startswith(('http://', 'https://')):
             resolved.append(anchor)
             continue
         if '../' in anchor:
             # TODO Process relative anchor and continue
             continue

         if base is None:
             # Only parse the context once, and only when an anchor needs it.
             uri_scheme, authority, _port, path = normalize_url(self.context)
             base = (uri_scheme, authority, path)
         uri_scheme, authority, path = base

         if anchor.startswith('/'):
             resolved.append(
                 "{0}{1}{2}".format(uri_scheme, authority, anchor))
             continue

         # Drop the last path segment (the document name) to get the
         # directory the relative anchor lives in.
         pieces = path[1:].split('/')
         pieces.pop()
         # A single remaining directory must be kept too; previously a
         # dangling ``else`` reset the path to '' in that case.
         directory = '/'.join(pieces) + '/' if pieces else ''
         resolved.append("{0}{1}/{2}{3}".format(
             uri_scheme, authority, directory, anchor))

     # Store only a fully built list so a failure midway is never cached.
     self.__anchors = resolved
     return self.__anchors
def can_access(url):
    """Return the verdict of ``Robot`` for the given ``url``.

    The URL is split with ``normalize_url``; a ``Robot`` is created for
    its authority and asked whether the URL's path can be accessed. The
    value returned is whatever ``Robot.can_access`` returns (presumably a
    boolean -- confirm against ``Robot``).
    """
    # Function-scope import, as in the original code.
    from crawler.connection import normalize_url

    # Scheme and port are not needed for the check, only host and path.
    _, authority, _, path = normalize_url(url)
    return Robot(authority).can_access(path)
def can_access(url):
    """Return the verdict of ``Robot`` for the given ``url``.

    The URL is split with ``normalize_url``; a ``Robot`` is created for
    its authority and asked whether the URL's path can be accessed. The
    value returned is whatever ``Robot.can_access`` returns (presumably a
    boolean -- confirm against ``Robot``).
    """
    # Function-scope import, as in the original code.
    from crawler.connection import normalize_url

    # Scheme and port are not needed for the check, only host and path.
    _, authority, _, path = normalize_url(url)
    return Robot(authority).can_access(path)