Example #1
 def links(self, soup):
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     metas = filter(
         lambda meta: meta.attrs.get('http-equiv', '').lower() == 'refresh',
         soup.find_all('meta'))
     metas = filter(lambda meta: '=' in meta.attrs.get('content', ''),
                    metas)
     links += [
         full_url_address(meta.attrs['content'].split('=', 1)[1],
                          self.crawler_url.url)
         for meta in metas
     ]
     for link in filter(bool, links):
         url = Url(link)
         if not url.is_valid():
             continue
         depth = self.crawler_url.depth
         if url.domain != self.crawler_url.url.domain or \
                 not url.path.startswith(self.crawler_url.url.directory_path):
             depth -= 1
         if depth <= 0:
             continue
         self.add_url(link, depth)
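Example #1 also follows meta http-equiv="refresh" redirects by keeping whatever comes after the first '=' in the content attribute. A quick standalone check of that parsing (the HTML snippet is made up for illustration):

 from bs4 import BeautifulSoup

 html = '<meta http-equiv="refresh" content="5; url=/new-location">'
 meta = BeautifulSoup(html, 'html.parser').find('meta')
 # Keep everything after the first '=' in the content attribute
 print(meta.attrs['content'].split('=', 1)[1])  # /new-location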
Example #2
 def get_links(self, text, soup=None):
     links = [
         full_url_address(link.attrs.get('href'),
                          self.processor.crawler_url.url)
         for link in soup.find_all('a')
     ]
     return [Url(link) for link in links]
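full_url_address itself is not shown on this page. Every example calls it with a raw href/src value and the current crawler_url.url, so a minimal stand-in, assuming it simply resolves that value against the page URL and tolerates a missing attribute, could look like this (hypothetical, not the project's actual implementation):

 from urllib.parse import urljoin

 def full_url_address(address, base_url):
     # Hypothetical stand-in: resolve a possibly-relative address against
     # the page URL; return None when the tag had no href/src at all.
     if not address:
         return None
     return urljoin(str(base_url), address)

 print(full_url_address('../img/logo.png', 'http://example.com/dir/page.html'))
 # http://example.com/img/logo.png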
Example #3
 def get_links(self, text, soup=None):
     """
     :param text:
     :param soup:
     :return:
     """
     contents = list(
         filter(lambda x: isinstance(x, NavigableString) or is_link(x),
                soup.find('pre').contents))
     links = []
     for i, content in enumerate(contents):
         if not is_link(content) or '?' in content.attrs.get('href', ''):
             continue
         link = Url(
             full_url_address(content.attrs.get('href'),
                              self.processor.crawler_url.url))
         if i + 1 < len(contents) and isinstance(contents[i + 1],
                                                 NavigableString):
             extra = {}
             text = str(contents[i + 1])
             dt = DATETIME_PATTERN.findall(text)
             if dt:
                 extra['created_at'] = dt[0]
             size = FILESIZE_PATTERN.findall(text)
             if size:
                 extra['filesize'] = size[0].rstrip(' ')
             link.add_extra(extra)
         links.append(link)
     return links
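Example #3 walks an index-of style <pre> block, where each link is followed by a text node holding the modification date and file size. DATETIME_PATTERN and FILESIZE_PATTERN are project constants not shown here; with made-up stand-ins, the extraction of the extras works like this:

 import re
 from bs4 import BeautifulSoup

 # Hypothetical stand-ins for the project's DATETIME_PATTERN / FILESIZE_PATTERN
 DATETIME_PATTERN = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
 FILESIZE_PATTERN = re.compile(r'([\d.]+[KMG]? *)$')

 html = '<pre><a href="backup.zip">backup.zip</a>  2020-01-31 10:22  14K </pre>'
 link = BeautifulSoup(html, 'html.parser').find('a')
 trailing = str(link.next_sibling)              # NavigableString after the link
 print(DATETIME_PATTERN.findall(trailing)[0])   # 2020-01-31 10:22
 print(FILESIZE_PATTERN.findall(trailing)[0].rstrip(' '))  # 14K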
Example #4
 def assets(self, soup):
     assets = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('link')
     ]
     assets += [
         full_url_address(script.attrs.get('src'), self.crawler_url.url)
         for script in soup.find_all('script')
     ]
     assets += [
         full_url_address(img.attrs.get('src'), self.crawler_url.url)
         for img in soup.find_all('img')
     ]
     for asset in filter(bool, assets):
         self.analyze_asset(asset)
         self.add_url(asset, type='asset')
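Example #4 collects stylesheets, scripts and images in one pass: href on link tags, src on script and img tags. A quick check with a small made-up document:

 from bs4 import BeautifulSoup

 html = ('<link rel="stylesheet" href="/static/site.css">'
         '<script src="app.js"></script><img src="logo.png">')
 soup = BeautifulSoup(html, 'html.parser')
 assets = [tag.attrs.get('href') for tag in soup.find_all('link')]
 assets += [tag.attrs.get('src') for tag in soup.find_all(['script', 'img'])]
 print(assets)  # ['/static/site.css', 'app.js', 'logo.png']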
Example #5
 def process(self, text, soup=None):
     if sys.version_info > (3, ) and isinstance(text, bytes):
         text = text.decode('utf-8')
     urls = [
         full_url_address(url[0], self.crawler_url.url) for url in
         re.findall(TEXT_PLAIN_PATH_STRING_REGEX, text, re.VERBOSE)
     ]
     for url in urls:
         self.add_url(url, depth=0, type='asset')
     return urls
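TEXT_PLAIN_PATH_STRING_REGEX is defined elsewhere in the project; the url[0] indexing tells us it has more than one capture group, because re.findall then returns tuples rather than strings. A hypothetical pattern showing that behaviour:

 import re

 # Hypothetical stand-in with two groups: with multiple capture groups,
 # re.findall returns tuples, hence url[0] in the code above.
 PATH_REGEX = r'''
     (/[\w./-]+)      # an absolute path
     ([\s"']|$)       # followed by whitespace, a quote, or end of input
 '''
 text = 'Backup stored in /var/www/backup.zip yesterday'
 print(re.findall(PATH_REGEX, text, re.VERBOSE))
 # [('/var/www/backup.zip', ' ')]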
Example #6
 def process(self, text, soup=None):
     if sys.version_info > (3, ) and isinstance(text, bytes):
         text = text.decode('utf-8')
     urls = [
         full_url_address(url, self.crawler_url.url)
         for url in re.findall(r': *url\(["\']?(.+?)["\']?\)', text)
     ]
     for url in urls:
         self.add_url(url, depth=0, type='asset')
     return urls
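The url() pattern in Example #6 pulls every referenced value out of a stylesheet, with or without quotes (written as a raw string to avoid Python's invalid-escape warning). A quick check against a made-up stylesheet:

 import re

 css = 'body { background: url("../img/bg.png") } .logo { background:url(/logo.svg) }'
 print(re.findall(r': *url\(["\']?(.+?)["\']?\)', css))
 # ['../img/bg.png', '/logo.svg']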
Example #7
 def links(self, soup):
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     for link in filter(bool, links):
         url = Url(link)
         if not url.is_valid():
             continue
         depth = self.crawler_url.depth
         if url.domain != self.crawler_url.url.domain or \
                 not url.path.startswith(self.crawler_url.url.directory_path):
             depth -= 1
         if depth <= 0:
             continue
         self.add_url(link, depth)
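The depth handling shared by Examples #1 and #7 boils down to: links that leave the current domain or directory cost one extra crawl level, and anything left with no remaining depth is skipped. As a standalone check:

 def remaining_depth(depth, same_domain, same_directory):
     # Off-domain or off-directory links consume one crawl level
     if not (same_domain and same_directory):
         depth -= 1
     return depth

 print(remaining_depth(1, same_domain=True, same_directory=False))   # 0 -> skipped
 print(remaining_depth(3, same_domain=False, same_directory=False))  # 2 -> queued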
Example #8
 def __init__(self, response, crawler_url):
     super(ProcessRedirect, self).__init__(response, crawler_url)
     self.redirector = full_url_address(response.headers.get('Location'),
                                        self.crawler_url.url)
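Example #8 resolves the Location header against the current URL, so relative redirects work as well. With urljoin standing in for full_url_address (as in the sketch after Example #2):

 from urllib.parse import urljoin

 print(urljoin('http://example.com/admin/', '../login'))
 # http://example.com/login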