def links(self, soup):
    # Targets of every <a href="..."> in the page.
    links = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('a')
    ]
    # Redirect targets of <meta http-equiv="refresh" content="N; url=..."> tags:
    # everything after the first '=' in the content attribute is the url.
    metas = filter(
        lambda meta: meta.attrs.get('http-equiv', '').lower() == 'refresh',
        soup.find_all('meta'))
    metas = filter(lambda meta: '=' in meta.attrs.get('content', ''), metas)
    links += list(
        map(
            lambda meta: full_url_address(
                meta.attrs['content'].split('=', 1)[1],
                self.crawler_url.url),
            metas))
    for link in filter(bool, links):
        url = Url(link)
        if not url.is_valid():
            continue
        depth = self.crawler_url.depth
        # Leaving the current domain or directory tree costs one depth level.
        if url.domain != self.crawler_url.url.domain or \
                not url.path.startswith(self.crawler_url.url.directory_path):
            depth -= 1
        if depth <= 0:
            continue
        self.add_url(link, depth)
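# Hedged, standalone sketch (not project code): how the split('=', 1)[1] step
# above extracts the redirect target from a meta refresh tag. Requires
# beautifulsoup4; the html snippet is an invented example.
from bs4 import BeautifulSoup

html = '<meta http-equiv="refresh" content="0; url=/admin/">'
soup = BeautifulSoup(html, 'html.parser')
for meta in soup.find_all('meta'):
    if meta.attrs.get('http-equiv', '').lower() != 'refresh':
        continue
    content = meta.attrs.get('content', '')
    if '=' in content:
        print(content.split('=', 1)[1])  # -> /admin/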
def get_links(self, text, soup=None):
    """Extract file links from a <pre> directory index.

    :param text: response body (unused; links are read from the soup)
    :param soup: parsed page
    :return: list of Url objects, optionally annotated with created_at/filesize
    """
    # Keep the anchors plus the text nodes between them: the text node that
    # follows an anchor carries the date and size columns of the listing.
    contents = list(
        filter(lambda x: isinstance(x, NavigableString) or is_link(x),
               soup.find('pre').contents))
    links = []
    for i, content in enumerate(contents):
        # Skip non-links and sort links (hrefs with a query string).
        if not is_link(content) or '?' in content.attrs.get('href', ''):
            continue
        link = Url(
            full_url_address(content.attrs.get('href'),
                             self.processor.crawler_url.url))
        if i + 1 < len(contents) and isinstance(contents[i + 1],
                                                NavigableString):
            extra = {}
            text = str(contents[i + 1])
            dt = DATETIME_PATTERN.findall(text)
            if dt:
                extra['created_at'] = dt[0]
            size = FILESIZE_PATTERN.findall(text)
            if size:
                extra['filesize'] = size[0].rstrip(' ')
            link.add_extra(extra)
        links.append(link)
    return links
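# Hedged sketch of the surrounding data: in an Apache-style "Index of" page the
# text node after each <a> tag looks roughly like the string below. The real
# DATETIME_PATTERN and FILESIZE_PATTERN are module constants not shown here, so
# the two regexes are illustrative guesses, not the project's patterns.
import re

trailing_text = '        2019-03-12 10:41  1.2M  '
DATETIME_GUESS = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
FILESIZE_GUESS = re.compile(r'\d+(?:\.\d+)?[KMG] ')

print(DATETIME_GUESS.findall(trailing_text))                           # ['2019-03-12 10:41']
print([s.rstrip(' ') for s in FILESIZE_GUESS.findall(trailing_text)])  # ['1.2M']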
def get_links(self, text, soup=None):
    links = [
        full_url_address(link.attrs.get('href'),
                         self.processor.crawler_url.url)
        for link in soup.find_all('a')
    ]
    return [Url(link) for link in links]
def links(self, soup):
    links = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('a')
    ]
    for link in filter(bool, links):
        url = Url(link)
        if not url.is_valid():
            continue
        depth = self.crawler_url.depth
        if url.domain != self.crawler_url.url.domain or \
                not url.path.startswith(self.crawler_url.url.directory_path):
            depth -= 1
        if depth <= 0:
            continue
        self.add_url(link, depth)
def process(self, text, soup=None):
    links = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('a')
    ]
    # Links ending in '/' are queued as directories; every link is kept as a file entry.
    for link in filter(lambda x: x.url.endswith('/'), links):
        self.add_url(link, type='directory')
    self.files = [Url(link) for link in links]
def __init__(self, crawler, url, depth=3, source=None, exists=None, type=None,
             timeout=10):
    """
    :type crawler: Crawler
    :type depth: int Maximum recursion allowed without going up a level
                 relative to this url
    """
    self.flags = set()
    self.depth = depth
    if not isinstance(url, Url):
        url = Url(url)
    if url.is_valid():
        # Query strings and fragments are irrelevant for directory discovery.
        url.query = ''
        url.fragment = ''
    self.url = url
    self.crawler = crawler
    self.source = source
    self.exists = exists
    self.type = type
    self.timeout = timeout
    if url.is_valid() and (not url.path or url.path == '/'):
        self.type = 'directory'
    self.resp = None
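# Hedged, standalone illustration (not project code) of the path check above:
# a url whose path is empty or just '/' is classified as a directory when the
# CrawlerUrl is built. urllib.parse stands in for the project's Url class here.
from urllib.parse import urlparse

for candidate in ('http://example.com', 'http://example.com/',
                  'http://example.com/robots.txt'):
    path = urlparse(candidate).path
    print(candidate, '->',
          'directory' if not path or path == '/' else 'not classified yet')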
def is_url_loop(url, ignore_end=True):
    url = url if isinstance(url, Url) else Url(url)
    directories = list(filter(bool, url.directories))
    directories.reverse()
    # Starting from the end of the path, try chunk sizes of 1, 2, 3... directories
    # and flag a loop when the same chunk repeats MATCHS_LOOP_NUM times in a row.
    for i in range(1, (len(directories) // MATCHS_LOOP_NUM) + 1):
        groups = [
            tuple(directories[j:j + i])
            for j in range(0, MATCHS_LOOP_NUM * i, i)
        ]
        if len(set(groups)) == 1 and len(groups) >= MATCHS_LOOP_NUM:
            return True
    if ignore_end:
        # The last directory may be an incomplete repetition; retry without it.
        return is_url_loop(url.parent(), False)
    return False
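# Hedged, standalone sketch (not project code): the same repetition check applied
# to a plain list of path segments, assuming MATCHS_LOOP_NUM is 3 (the real value
# is a module constant not shown here). It illustrates why a path like
# /a/b/a/b/a/b/ is treated as a crawler trap while /a/b/c/ is not.
MATCHS_LOOP_NUM_GUESS = 3  # assumed value

def has_repeating_tail(directories, matches=MATCHS_LOOP_NUM_GUESS):
    directories = list(reversed([d for d in directories if d]))
    for size in range(1, (len(directories) // matches) + 1):
        groups = [tuple(directories[j:j + size])
                  for j in range(0, matches * size, size)]
        if len(set(groups)) == 1 and len(groups) >= matches:
            return True
    return False

print(has_repeating_tail(['a', 'b', 'a', 'b', 'a', 'b']))  # True: "a/b" repeats 3 times
print(has_repeating_tail(['a', 'b', 'c']))                 # False: no repetition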
def full_url_address(address, url):
    """
    :type url: Url
    :type address: str
    :rtype: Url
    """
    if address is None:
        return
    # Reject links whose scheme (mailto:, javascript:, ...) is not crawlable.
    protocol_match = address.split(':', 1)[0] if ':' in address else ''
    protocol_match = re.match('^([A-Za-z0-9\\-]+)$', protocol_match)
    if protocol_match and protocol_match.group(1) not in ACCEPTED_PROTOCOLS:
        return
    # TODO: improve this. Accept other protocols to be rejected
    if address.startswith('//'):
        # Protocol-relative address: inherit the scheme of the base url.
        address = address.replace('//', '{}://'.format(url.protocol), 1)
    if '://' not in address or address.startswith('/'):
        # Relative address: resolve it against a copy of the base url.
        url = url.copy()
        url.path = address
        return url
    url = Url(address)
    if url.is_valid():
        return url
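# Hedged illustration (not project code): the standard library's urljoin covers
# the same categories of address that full_url_address handles (relative path,
# absolute path, protocol-relative). It is shown only as a reference for the
# kinds of inputs involved; the project resolves them through its own Url class
# and may differ in detail.
from urllib.parse import urljoin

base = 'http://example.com/dir/index.html'
print(urljoin(base, 'file.txt'))             # -> http://example.com/dir/file.txt
print(urljoin(base, '/other/'))              # -> http://example.com/other/
print(urljoin(base, '//cdn.example.com/x'))  # -> http://cdn.example.com/x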
def _get_url_info(self):
    return UrlInfo(Sessions(), Url(self.url))
def test_callback(self):
    with patch.object(UrlsInfo, '_get_url_info') as m:
        UrlsInfo([self.url], Sessions()).callback(len(self.url), Url(self.url), 0)
        m.assert_called_once()