from requests_html import HTMLSession


def get_direct_link(link):
    session = HTMLSession()
    session.headers["Accept-Language"] = "zh-CN"
    # HEAD the short link and read the redirect target from the headers.
    response = session.head(link)
    url = response.headers["Location"]
    session.close()
    return url
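# Usage sketch for get_direct_link: assumes the target replies to a HEAD
# request with a Location header (the short URL below is a hypothetical
# placeholder).
short_url = "https://example.com/r/abc123"
print(get_direct_link(short_url))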
from requests_html import HTMLSession


def is_excel_file(url):
    """Check whether the URL points to an Excel file.

    Do this by looking at the Content-Type of the response headers.
    """
    session = HTMLSession()
    h = session.head(url, allow_redirects=True)
    content_type = h.headers.get("content-type")
    # Guard against responses that carry no Content-Type header at all.
    if content_type and "ms-excel" in content_type.lower():
        return True
    return False
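# Usage sketch for is_excel_file: servers typically announce legacy Excel
# downloads with a Content-Type such as "application/vnd.ms-excel" (the URL
# below is a hypothetical placeholder).
if is_excel_file("https://example.com/reports/latest.xls"):
    print("Excel file detected")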
from pathlib import Path

from requests_html import HTMLSession
from tqdm import tqdm

# `arquivo` (cache helper) and `rastreador_links` (mode flag) are expected to
# be provided by the surrounding module.


class Links:
    def __init__(self, url, links404, erroLink, RastrearLinks=True):
        self.url = url
        self.links404 = links404
        self.erroLink = erroLink
        self.RastrearLinks = RastrearLinks
        self.session = HTMLSession()
        self.linksConfirmados = {
            "Todos": [self.url],
            "Mapa Site": [],
            "MPI": []
        }
        self.cache = (
            f"./Modulos/WebCache/{self.url_base(self.url, False)}"
            if not rastreador_links else
            f"./Modulos/WebCache/RastreadorLinks/{self.url_base(self.url, False)}")
        if rastreador_links:
            Path('./Modulos/WebCache/RastreadorLinks/').mkdir(
                parents=True, exist_ok=True)

    @property
    def links_site(self):
        if self.RastrearLinks:
            self.rastrear(self.url)
        r = self.session.get(self.url + "mapa-site")
        mapaSite = r.html.find(".sitemap li a")
        if self.RastrearLinks:
            for linkMapaDoSite in mapaSite:
                self.linksConfirmados["Mapa Site"].append(
                    linkMapaDoSite.attrs["href"])
        else:
            # Without crawling, the sitemap is the only source of links.
            for linkMapaDoSite in mapaSite:
                self.linksConfirmados["Mapa Site"].append(
                    linkMapaDoSite.attrs["href"])
                self.linksConfirmados["Todos"].append(
                    linkMapaDoSite.attrs["href"])
        subMenuInfo = r.html.find(".sitemap ul.sub-menu-info li a")
        for linkMPI in subMenuInfo:
            self.linksConfirmados["MPI"].append(linkMPI.attrs["href"])
        arquivo.cache(self.linksConfirmados, f"{self.cache}__cache.json")
        if not rastreador_links:
            self.valida_404(self.linksConfirmados["Todos"])
        return self.linksConfirmados

    def valida_url(self, url):
        # Reject query strings, anchors, binary assets and non-HTTP schemes.
        lowered = url.lower()
        return not ("?" in url or "#" in url or ".jpg" in lowered
                    or ".jpeg" in lowered or ".png" in lowered
                    or ".pdf" in lowered or "tel:" in url
                    or "mailto:" in url)

    def rastrear(self, url):
        links = [url]
        for link in tqdm(links, unit=" links",
                         desc=" Rastreando e categorizando os links",
                         leave=False):
            try:
                r = self.session.get(link)
                pageLinks = r.html.absolute_links
            except Exception:
                self.erroLink.append(link)
            else:
                for pageLink in pageLinks:
                    if (self.url_base(self.url) in pageLink
                            and self.valida_url(pageLink)):
                        if pageLink not in links and link not in self.erroLink:
                            links.append(pageLink)
        self.linksConfirmados["Todos"] = links.copy()
        links.clear()

    def valida_404(self, urls):
        # Iterate over a copy: entries are removed from the original list.
        for url in tqdm(list(urls), unit=" links",
                        desc=" Verificando se há links levando para página 404",
                        leave=False):
            try:
                location = self.session.head(url).headers["Location"]
            except Exception:
                continue
            else:
                if "/404" in location:
                    self.links404.append(url)
                    self.linksConfirmados["Todos"].remove(url)

    def url_base(self, limpaUrl, mpitemporario=True):
        limpaUrl = limpaUrl.split("//")
        limpaUrl = limpaUrl[1].split("/")
        return limpaUrl[0] if mpitemporario else [x for x in limpaUrl if x][-1]
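# Usage sketch for Links: the caller owns the error lists and reads the crawl
# result from the links_site property. The site URL is a hypothetical
# placeholder, and `rastreador_links` / `arquivo` must exist at module level
# for this to run.
links404, erros = [], []
site = Links("https://example.com/", links404, erros, RastrearLinks=True)
resultado = site.links_site
print(len(resultado["Todos"]), "links,", len(links404), "levando a 404")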
import hashlib
import os
import random
import shutil
import string
import sys
from pathlib import Path

from html2text import HTML2Text
from requests_html import HTMLSession
from slugify import slugify

# HASH_BUF_SIZE, OLD_BASE_DIR and DOWNLOADS_DIR are module-level constants
# expected to be defined elsewhere in this project.


class MoodleDL:
    def __init__(self, base_url='https://campus.exactas.uba.ar/'):
        self._session = HTMLSession()
        self._base_url = base_url
        self._processed_urls = set()

    def head(self, url, *args, **kwargs):
        if not url.startswith('http'):
            url = self._base_url + url
        return self._session.head(url, *args, **kwargs)

    def get(self, url, *args, **kwargs):
        if not url.startswith('http'):
            url = self._base_url + url
        return self._session.get(url, *args, **kwargs)

    def post(self, url, *args, **kwargs):
        if not url.startswith('http'):
            url = self._base_url + url
        return self._session.post(url, *args, **kwargs)

    def normalize_etag(self, etag):
        # Strip weak-validator prefixes and surrounding quotes.
        if etag is None:
            return None
        if etag.startswith('W/"') and etag.endswith('"'):
            return etag[3:-1]
        if etag.startswith('"') and etag.endswith('"'):
            return etag[1:-1]
        return etag

    def etag_sha1_matches(self, url, filename):
        # Assumes the server's ETag is the SHA1 of the file.
        res = self.head(url, allow_redirects=True)
        etag = self.normalize_etag(res.headers.get('ETag'))
        if not etag:
            for r in res.history:
                etag = r.headers.get('ETag')
                if etag:
                    break
            else:
                print('No ETag on headers', filename, url)
                return False
        if not Path(filename).exists():
            # print('File not previously downloaded', filename, url)
            return False
        sha1 = hashlib.sha1()
        with open(filename, 'rb') as f:
            while True:
                data = f.read(HASH_BUF_SIZE)
                if not data:
                    break
                sha1.update(data)
        digest = sha1.hexdigest()
        if digest != etag:
            print('Digest and ETag mismatch', filename, url)
            print(digest, etag)
            return False
        return True

    def download_file(self, url, name, basedir):
        if name is not None:
            filename = self.path(name, 'files_' + basedir)
            old_filename = os.path.join(OLD_BASE_DIR, filename)
            # Reuse the previous download when its hash matches the ETag.
            if self.etag_sha1_matches(url, old_filename):
                os.rename(old_filename, filename)
                return
        res = self.get(url)
        data = res.content
        if name is None:
            content_disp = res.headers.get('Content-Disposition')
            if content_disp:
                if content_disp.startswith("attachment; filename="):
                    cdname = content_disp[21:]
                    if cdname[0] == cdname[-1] == '"':
                        cdname = cdname[1:-1]
                    name = cdname
        if name is None:
            # TODO: this should be derived from some other information instead
            name = ''.join(random.sample(string.ascii_lowercase, 8))
        filename = self.path(name, 'files_' + basedir)
        with open(filename, 'wb') as f:
            f.write(data)

    def login(self, username, password):
        return self.post('login/index.php', data={
            'action': 'login',
            'username': username,
            'password': password,
        })

    def agree_policy(self, res):
        return self.post('user/policy.php', data={
            'sesskey': res.html.find(
                '#region-main form input[name=sesskey]',
                first=True).attrs['value'],
            'agree': '1',
        })

    def fetch_course(self, course_name, course_id):
        self._course_id = course_id
        self._course_name = course_name
        self.rename_old()
        # get course main page
        res = self.get('course/view.php?id=%s' % course_id)
        # handle policy agreement if the campus redirects to it
        if 'policy' in res.url:
            res = self.agree_policy(res)
        self.parse_course(res)

    def parse_course(self, res):
        topics = res.html.find('ul.topics > li.section')
        if len(topics) == 1:
            self.recurse_in_tabs(res)
        else:
            raise NotImplementedError

    def base_path(self):
        return Path(DOWNLOADS_DIR) / slugify(self._course_name)

    def path(self, filename, *dir_parts):
        path = os.path.join(self.base_path().resolve(), *dir_parts)
        os.makedirs(path, exist_ok=True)
        return os.path.join(path, filename)

    def rename_old(self):
        path = self.base_path().resolve()
        old_path = os.path.join(OLD_BASE_DIR, path)
        if os.path.isdir(path):
            if os.path.isdir(old_path):
                shutil.rmtree(old_path)
            os.makedirs(old_path, exist_ok=True)
            os.rename(path, old_path)

    def recurse_in_tabs(self, res):
        if res.url in self._processed_urls:
            return
        self._processed_urls.add(res.url)
        for a in res.html.find('.nav-tabs li a'):
            href = a.attrs.get('href')
            if href and href not in self._processed_urls:
                # Mark the tab's own URL as processed before recursing.
                self._processed_urls.add(href)
                newres = self._session.get(href)
                self.recurse_in_tabs(newres)
        if res.html.find('.errormessage'):
            return
        self.parse_section(res)

    def parse_content(self, res, title):
        content = res.html.find('#region-main .content', first=True)
        if content is None:
            content = res.html.find('#region-main [role="main"]', first=True)
        extra = []
        for iframe in content.find('iframe'):
            src = iframe.attrs.get('src')
            if not src:
                continue
            extra.append("- iframe: URL=" + src)
        h = HTML2Text(baseurl='')
        h.ul_item_mark = '-'
        md_content = h.handle(content.html)
        if extra:
            md_extra_content = '\n\n'.join(extra)
            md_content += md_extra_content
        if md_content.strip() != '':
            with open(self.path(slugify(title) + '.md'), 'w') as f:
                f.write('# ' + title + '\n([fuente](' + res.url + '))\n---\n')
                f.write(md_content)
        return content

    def parse_section(self, res):
        title = res.html.find('.breadcrumb li:last-child span a span',
                              first=True).text
        content = self.parse_content(res, title)
        for a in content.find('a'):
            href = a.attrs.get('href')
            if not href:
                continue
            section_prefix = (
                "https://campus.exactas.uba.ar/course/view.php?id={}&section="
                .format(self._course_id))
            if '/mod/resource' in href:
                self.fetch_resource(href, slugify(title))
            elif '/mod/forum' in href:
                pass  # ignoring forum
            elif '/mod/url' in href:
                self.fetch_shortened_url(href, a.text)
            elif '/mod/page' in href:
                self.fetch_page_resource(href)
            elif href.startswith(section_prefix):
                self.fetch_section(href)
            else:
                print("unhandled resource", href, title, file=sys.stderr)

    def fetch_page_resource(self, url):
        if url in self._processed_urls:
            return
        self._processed_urls.add(url)
        res = self.get(url)
        self.parse_page_resource(res)

    def parse_page_resource(self, res):
        title = res.html.find('.breadcrumb li:last-child span a span',
                              first=True).text
        self.parse_content(res, title)

    def fetch_section(self, url):
        """Fetch a section from a URL that should look like
        /course/view.php?id={}&section={}, then call parse_section.
        """
        if url in self._processed_urls:
            return
        self._processed_urls.add(url)
        res = self.get(url)
        self.parse_section(res)

    def fetch_resource(self, url, basedir):
        res = self.get(url)

        def resource_url_name():
            content_disp = res.headers.get('Content-Disposition')
            if content_disp:
                if content_disp.startswith("inline; filename="):
                    filename = content_disp[17:]
                    if filename[0] == filename[-1] == '"':
                        filename = filename[1:-1]
                    return url, filename
            # try 'regular' moodle resource download page
            a = res.html.find('object a', first=True)
            if a:
                dl_url = href = a.attrs['href']
                dl_name = href.split('/')[-1]
                return dl_url, dl_name
            # try resourceimage page
            img = res.html.find('img.resourceimage', first=True)
            if img:
                dl_url = href = img.attrs['src']
                dl_name = href.split('/')[-1]
                return dl_url, dl_name
            # try raw download
            return url, None

        dl_url, dl_name = resource_url_name()
        if not dl_url:
            return
        self.download_file(dl_url, dl_name, basedir)

    def fetch_shortened_url(self, url, text):
        """Fetch a URL that's behind a "shortened" URL that looks like
        /mod/url/view.php?id={}, then store the destination URL.
        """
        if url in self._processed_urls:
            return
        url_id = int(url.split('/mod/url/view.php?id=')[-1])
        self._processed_urls.add(url)
        res = self.get(url)
        dest = res.url
        workaround = res.html.find('.urlworkaround', first=True)
        if workaround:
            dest = workaround.find("a", first=True).attrs['href']
        path = self.base_path() / "urls" / str(url_id)
        path.parent.mkdir(parents=True, exist_ok=True)
        if text.endswith("URL"):
            text = text[:-3]
        path.write_text('# {}\nURL="{}"'.format(text, dest))
from requests_html import HTMLSession
from tqdm import tqdm


class Links:
    def __init__(self, log):
        self.session = HTMLSession()
        self.log = log

    def links_site(self, url):
        self.linksConfirmados = []
        self.rastrear(url)
        self.valida_404(self.linksConfirmados)
        return self.linksConfirmados

    def valida_url(self, url):
        # Reject query strings, anchors, image files and non-HTTP schemes.
        lowered = url.lower()
        return not ("?" in url or "#" in url or ".jpg" in lowered
                    or ".jpeg" in lowered or ".png" in lowered
                    or "tel:" in url or "mailto:" in url)

    def rastrear(self, url):
        links = [url]
        for link in tqdm(links, unit=" links",
                         desc=" Rastreando e categorizando os links",
                         leave=False):
            try:
                r = self.session.get(link)
                links_pagina = r.html.absolute_links
            except Exception:
                self.log.append(f"{link} - ERRO: link não rastreado")
            else:
                for link_pagina in links_pagina:
                    if (self.url_base(url) in link_pagina
                            and self.valida_url(link_pagina)):
                        if link_pagina not in links:
                            links.append(link_pagina)
        self.linksConfirmados = links.copy()

    def valida_404(self, links):
        # Iterate over a copy so removals don't skip elements.
        for link in tqdm(list(links), unit=" links",
                         desc=" Verificando se há links levando para página 404",
                         leave=False):
            try:
                location = self.session.head(link).headers["Location"]
            except Exception:
                continue
            else:
                if not location.endswith("/404"):
                    links.remove(link)

    def url_base(self, url, mpitemporario=False):
        url = url.split("//")
        url = url[1].split("/")
        url = url[0] if mpitemporario else [x for x in url if x][-1]
        return url.replace('www.', '')
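# Usage sketch for this Links variant: it keeps its own log list and returns
# the surviving links directly (the URL is a hypothetical placeholder).
log = []
crawler = Links(log)
restantes = crawler.links_site("https://example.com/")
print(len(restantes), "links restantes,", len(log), "erros")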
import os
import re

from requests_html import HTMLSession


class Assets:
    def __init__(self, url_base, log):
        self.session = HTMLSession()
        self.log = log
        self.url_base = url_base
        self.raiz = url_base.split('/')[2].replace('www.', '')

    # TODO: validate that the status code is 200 before downloading the file
    # (pending).

    def imagens(self, url):
        r = self.session.get(url)
        imagens = r.html.xpath('//img/@src')
        for img in imagens:
            try:
                # Strip thumbnail/resizer wrappers to reach the real image.
                img_thumbs = r'(h.*)?inc/scripts/thumbs\.php\?w=(.*?)&(amp;)?h=(.*?)&(amp;)?imagem='
                img_tim = r'(h.*)?tim\.php\?src='
                img = re.sub(f'({img_thumbs}|{img_tim})', '', img)
                if "?" in img:
                    img = img.split('?')[0]
                if "&" in img:
                    img = img.split('&')[0]
                img_local = img.replace(self.url_base, '')
                if not os.path.isfile(img_local):
                    diretorio = img_local.split('/')[:-1]
                    if not os.path.isdir(f"{self.raiz}/{'/'.join(diretorio)}"):
                        os.makedirs(f"{self.raiz}/{'/'.join(diretorio)}")
                    with open(f"{self.raiz}/{img_local}", "wb") as arquivo:
                        arquivo.write(self.session.get(img).content)
            except Exception:
                self.log.append(f'Não foi possível baixar a imagem: {img}')

    def file_head(self, url):
        r = self.session.get(url)
        head_links = r.html.xpath('//link/@href | //script/@src')
        links_js = re.findall(r'script\([\'\"](.*?)[\'\"]\)\.wait',
                              r.html.html)
        links_in_css = re.findall(r'url\([\'"]?(.*?)[\'"]?\)', r.html.html)
        scanner_file_css = [x for x in head_links if ".css" in x]
        for file in scanner_file_css:
            try:
                if "http" not in file:
                    file = f"{self.url_base}{file}"
                css = self.session.get(file)
                path_in_css = re.findall(r'url\([\'"]?(.*?)[\'"]?\)',
                                         css.html.html)
                path_in_css = [re.sub(r'<\?=.*?\?>', '', x)
                               for x in path_in_css]
                head_links.extend(path_in_css)
            except Exception:
                self.log.append(f"Não foi possível baixar o arquivo {file} ")
        head_links.extend(links_in_css)
        head_links.extend(links_js)
        # Drop external links and flag icons; keep only same-site assets.
        head_links = [x for x in head_links
                      if not ('http' in x and self.raiz not in x)]
        if url in head_links:
            head_links.remove(url)
        head_links = [x for x in head_links if "flags/" not in x]
        for head_link in head_links:
            try:
                head_link = head_link.replace('../', '')
                local = re.sub(f'.*?{self.raiz}/', '', head_link)
                if "?" in local:
                    local = local.split('?')[0]
                if not os.path.isfile(local) and len(local) > 0:
                    diretorio = local.split('/')[:-1]
                    if not os.path.isdir(f"{self.raiz}/{'/'.join(diretorio)}"):
                        os.makedirs(f"{self.raiz}/{'/'.join(diretorio)}")
                    if "http" not in head_link:
                        head_link = f"{self.url_base}{head_link}"
                    if not self.is_404(f'{self.url_base}{local}'):
                        with open(f"{self.raiz}/{local}", "wb") as arquivo:
                            arquivo.write(self.session.get(head_link).content)
            except Exception:
                self.log.append(
                    f"Não foi possível baixar o arquivo {head_link} ")

    def download_file(self, url):
        try:
            name_page = (url.replace(self.url_base, '')
                         if url != self.url_base else "index")
            if "?" in name_page:
                name_page = name_page.split('?')[0]
            if not os.path.isfile(name_page) and len(name_page) > 0:
                diretorio = name_page.split('/')[:-1]
                if not os.path.isdir(f"{self.raiz}/{'/'.join(diretorio)}"):
                    os.makedirs(f"{self.raiz}/{'/'.join(diretorio)}")
                r = self.session.get(url)
                html = r.html.html
                url_regex = f"(http://|https://)?(www.)?{self.raiz}/"
                html = re.sub(r'<base href=".*?">', '', html)
                img_thumbs = r'(h.*)?inc/scripts/thumbs\.php\?w=(.*?)&(amp;)?h=(.*?)&(amp;)?imagem='
                img_tim = r'(h.*)?tim\.php\?src='
                html = re.sub(f'({img_thumbs}|{img_tim})', '', html)
                html = re.sub(r'<img(.*?)(src ?= ?)[\'\"](.*?)(&.*?)[\'\"]',
                              r'<img\1\2"\3"', html)
                html = html.replace('../', '{{ path }}')
                html = re.sub(f'href="{url_regex}"',
                              'href="{{ path }}index.html"', html)
                # \4 is the path captured after url_regex (which itself
                # contributes groups 2 and 3).
                html = re.sub(f'<a(.*?)href="{url_regex}(.*?)"',
                              r'<a\1href="' + '{{ path }}' + r'\4.html"',
                              html)
                html = re.sub(r'href="(.*?\..*?)\.html"', r'href="\1"', html)
                html = html.replace(f'{self.url_base}', '{{ path }}')
                html = html.replace('{{ path }}', '../' * len(diretorio))
                with open(f"{self.raiz}/{name_page}.html", "w",
                          encoding="utf-8") as arquivo:
                    arquivo.write(str(html))
        except Exception:
            self.log.append(f"Não foi possível baixar o arquivo {name_page}")

    def is_404(self, link):
        try:
            location = self.session.head(link).headers["Location"]
        except Exception:
            return False
        else:
            return location.endswith("/404")
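# Usage sketch for Assets: mirror one page plus the images and head assets it
# references into a local folder named after the domain (the URLs are
# hypothetical placeholders).
log = []
assets = Assets("https://example.com/", log)
assets.download_file("https://example.com/contato")
assets.imagens("https://example.com/contato")
assets.file_head("https://example.com/contato")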
from requests_html import HTMLSession


class Imagens:
    def __init__(self, erroImgQuebrada, erroTamanho, erroTitleAlt,
                 erroValidador):
        self.session = HTMLSession()
        self.erroImgQuebrada = erroImgQuebrada
        self.erroTamanho = erroTamanho
        self.erroTitleAlt = erroTitleAlt
        self.erroValidador = erroValidador

    def verifica(self, pagina, imagens):
        try:
            for imagem in imagens:
                # Normalize src values that carry a prefix before "http".
                if imagem.attrs["src"][0:4].lower() != "http":
                    i = imagem.attrs["src"].find("http")
                    imagem.attrs["src"] = imagem.attrs["src"][i:]
                if self.session.head(imagem.attrs["src"]).status_code != 200:
                    self.erroImgQuebrada.append(
                        f"{pagina}: {imagem.attrs['src']}")
                else:
                    self.tamanho_imagem(imagem)
                    self.alt_title(imagem, pagina)
            repetida = self.duplicado(self.erroTamanho)
            if repetida:
                self.erroTamanho.remove(repetida)
        except Exception:
            self.erroValidador.append(pagina)

    def tamanho_imagem(self, imagem):
        if "&imagem=" not in imagem.attrs["src"]:
            tamanho = int(self.session.head(
                imagem.attrs["src"]).headers["Content-Length"]) / 1024
            if round(tamanho) > 200:
                src = imagem.attrs["src"]
                # Only report each oversized image once.
                if src.lower() not in (x.lower() for x in self.erroTamanho):
                    self.erroTamanho.append(src)

    def alt_title(self, imagem, pagina):
        try:
            if "escrev" in imagem.attrs["alt"].lower():
                self.erroTitleAlt.append(
                    f"{pagina} - src='{imagem.attrs['src']}' ALT com 'ESCREVA AQUI'")
            elif "exemplo de mpi" in imagem.attrs["alt"].lower():
                self.erroTitleAlt.append(
                    f"{pagina} - src='{imagem.attrs['src']}' ALT com 'Exemplo de MPI'")
        except KeyError:
            self.erroTitleAlt.append(
                f"{pagina} - src='{imagem.attrs['src']}' Imagem sem ALT")
        try:
            if "escrev" in imagem.attrs["title"].lower():
                self.erroTitleAlt.append(
                    f"{pagina} - src='{imagem.attrs['src']}' com TITLE 'ESCREVA AQUI'")
            elif "exemplo de mpi" in imagem.attrs["title"].lower():
                self.erroTitleAlt.append(
                    f"{pagina} - src='{imagem.attrs['src']}' com TITLE 'Exemplo de MPI'")
        except KeyError:
            self.erroTitleAlt.append(
                f"{pagina} - src='{imagem.attrs['src']}' - Imagem sem TITLE")

    def duplicado(self, lista):
        contagem = dict()
        for indice in lista:
            item = indice.strip()
            contagem[item] = contagem.get(item, 0) + 1
        for maxRepeticao in contagem:
            if contagem[maxRepeticao] > 1:
                return maxRepeticao
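# Usage sketch for Imagens: fetch a page, collect its <img> elements with
# requests_html and run the checks; results accumulate in the shared lists
# (the page URL is a hypothetical placeholder).
quebradas, pesadas, sem_texto, falhas = [], [], [], []
validador = Imagens(quebradas, pesadas, sem_texto, falhas)
pagina = "https://example.com/servicos"
resposta = HTMLSession().get(pagina)
validador.verifica(pagina, resposta.html.find("img"))
print(len(quebradas), "imagens quebradas,", len(pesadas), "acima de 200 KB")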
import multiprocessing
import time

import lxml.etree
from requests import adapters, exceptions
from requests_html import HTMLSession, MaxRetries

# color_logging is a project-level logging helper defined elsewhere.


class Worker(multiprocessing.Process):
    def __init__(self, unvisited_urls_queue, fetched_urls_queue, result_queue,
                 counter, config):
        multiprocessing.Process.__init__(self)
        self.unvisited_urls_queue = unvisited_urls_queue
        self.fetched_urls_queue = fetched_urls_queue
        self.result_queue = result_queue
        self.counter = counter
        self.config = config
        self.kwargs = config["kwargs"]
        self.session = HTMLSession()
        # Enlarge the connection pool so many URLs can be checked in parallel.
        a = adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.session.mount("http://", a)
        self.session.mount("https://", a)

    def get_url_type(self, url, resp):
        for include_snippet in self.config["include"]:
            if include_snippet in url:
                content_type = resp.headers.get('Content-Type', None)
                if content_type and "text/html" in content_type:
                    return 'recursive'
                return 'static'
        return "external"

    def check_url_info(self, url):
        for exclude_snippet in self.config["exclude"]:
            if exclude_snippet in url:
                return (None, "exclude")
        try:
            resp = self.session.head(url, **self.kwargs)
            status_code = resp.status_code
            url_type = self.get_url_type(url, resp)
        except exceptions.ConnectTimeout as ex:
            color_logging(f"{url}: {str(ex)}", 'WARNING')
            status_code = "ConnectTimeout"
            url_type = None
        except exceptions.ConnectionError as ex:
            color_logging(f"{url}: {str(ex)}", 'WARNING')
            status_code = "ConnectionError"
            url_type = None
        return (status_code, url_type)

    def get_hyper_links(self, url):
        hyper_links = set()
        try:
            resp = self.session.get(url, **self.kwargs)
            status_code = resp.status_code
        except exceptions.ConnectionError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
            # No response object to parse; bail out early.
            return ("ConnectionError", hyper_links)
        try:
            # Render JavaScript so dynamically inserted links are found too.
            resp.html.render(sleep=1, timeout=30)
            hyper_links = resp.html.absolute_links
        except lxml.etree.ParserError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        except UnicodeDecodeError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        except MaxRetries as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        return (status_code, hyper_links)

    def run(self):
        while True:
            unvisited_url = self.unvisited_urls_queue.get()
            if unvisited_url is None:
                # Poison pill means shutdown.
                color_logging(f'{self.name}: Exiting')
                self.unvisited_urls_queue.task_done()
                break
            start_time = time.time()
            status_code, url_type = self.check_url_info(unvisited_url)
            method = "HEAD"
            if url_type == "exclude":
                color_logging(f"skip url: {unvisited_url}", color="blue")
                self.unvisited_urls_queue.task_done()
                continue
            if url_type in ['static', 'external']:
                hyper_links = set()
            elif url_type == 'recursive':
                method = "GET & Render"
                status_code, hyper_links = self.get_hyper_links(unvisited_url)
            else:
                # url_type is None
                # TODO: raise exception
                hyper_links = set()
            duration_time = time.time() - start_time
            result = (unvisited_url, status_code, duration_time, hyper_links)
            self.result_queue.put(result)
            for link in hyper_links:
                self.fetched_urls_queue.put(link)
            self.unvisited_urls_queue.task_done()
            self.counter.value += 1
            color_logging(
                f"index: {self.counter.value}, {method} {unvisited_url}, "
                f"status_code: {status_code}, duration_time: {duration_time}, "
                f"worker: {self.name}", color="white")
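# Wiring sketch for Worker: one JoinableQueue feeds unvisited URLs, workers
# push results and newly discovered links back, and one poison pill (None)
# per worker shuts the pool down. The seed URL and config values are
# hypothetical placeholders.
if __name__ == '__main__':
    unvisited = multiprocessing.JoinableQueue()
    fetched = multiprocessing.Queue()
    results = multiprocessing.Queue()
    counter = multiprocessing.Value('i', 0)
    config = {
        "include": ["example.com"],   # crawl these hosts recursively
        "exclude": ["logout"],        # never touch these URLs
        "kwargs": {"timeout": 10},
    }
    workers = [Worker(unvisited, fetched, results, counter, config)
               for _ in range(4)]
    for w in workers:
        w.start()
    unvisited.put("https://example.com/")
    # A coordinator would move links from `fetched` into `unvisited`,
    # deduplicating as it goes, before sending the poison pills:
    for _ in workers:
        unvisited.put(None)
    unvisited.join()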