from requests_html import HTMLSession


def get_direct_link(link):
    session = HTMLSession()
    session.headers["Accept-Language"] = "zh-CN"
    # HEAD the short link and read the redirect target from the headers.
    response = session.head(link)
    url = response.headers["Location"]
    session.close()
    return url
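# Usage sketch for get_direct_link: assumes the target replies to a HEAD
# request with a Location header (the short URL below is a hypothetical
# placeholder).
short_url = "https://example.com/r/abc123"
print(get_direct_link(short_url))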
from requests_html import HTMLSession


def is_excel_file(url):
    """Check whether the URL points to an Excel file.

    Do this by looking at the Content-Type of the response headers.
    """
    session = HTMLSession()
    h = session.head(url, allow_redirects=True)
    content_type = h.headers.get("content-type")
    # Guard against responses that carry no Content-Type header at all.
    if content_type and "ms-excel" in content_type.lower():
        return True
    return False
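# Usage sketch for is_excel_file: servers typically announce legacy Excel
# downloads with a Content-Type such as "application/vnd.ms-excel" (the URL
# below is a hypothetical placeholder).
if is_excel_file("https://example.com/reports/latest.xls"):
    print("Excel file detected")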
from pathlib import Path

from requests_html import HTMLSession
from tqdm import tqdm

# `arquivo` (cache helper) and `rastreador_links` (mode flag) are expected to
# be provided by the surrounding module.


class Links:
    def __init__(self, url, links404, erroLink, RastrearLinks=True):
        self.url = url
        self.links404 = links404
        self.erroLink = erroLink
        self.RastrearLinks = RastrearLinks
        self.session = HTMLSession()
        self.linksConfirmados = {
            "Todos": [self.url],
            "Mapa Site": [],
            "MPI": []
        }
        self.cache = (
            f"./Modulos/WebCache/{self.url_base(self.url, False)}"
            if not rastreador_links else
            f"./Modulos/WebCache/RastreadorLinks/{self.url_base(self.url, False)}")
        if rastreador_links:
            Path('./Modulos/WebCache/RastreadorLinks/').mkdir(
                parents=True, exist_ok=True)

    @property
    def links_site(self):
        if self.RastrearLinks:
            self.rastrear(self.url)
        r = self.session.get(self.url + "mapa-site")
        mapaSite = r.html.find(".sitemap li a")
        if self.RastrearLinks:
            for linkMapaDoSite in mapaSite:
                self.linksConfirmados["Mapa Site"].append(
                    linkMapaDoSite.attrs["href"])
        else:
            # Without crawling, the sitemap is the only source of links.
            for linkMapaDoSite in mapaSite:
                self.linksConfirmados["Mapa Site"].append(
                    linkMapaDoSite.attrs["href"])
                self.linksConfirmados["Todos"].append(
                    linkMapaDoSite.attrs["href"])
        subMenuInfo = r.html.find(".sitemap ul.sub-menu-info li a")
        for linkMPI in subMenuInfo:
            self.linksConfirmados["MPI"].append(linkMPI.attrs["href"])
        arquivo.cache(self.linksConfirmados, f"{self.cache}__cache.json")
        if not rastreador_links:
            self.valida_404(self.linksConfirmados["Todos"])
        return self.linksConfirmados

    def valida_url(self, url):
        # Reject query strings, anchors, binary assets and non-HTTP schemes.
        lowered = url.lower()
        return not ("?" in url or "#" in url or ".jpg" in lowered
                    or ".jpeg" in lowered or ".png" in lowered
                    or ".pdf" in lowered or "tel:" in url
                    or "mailto:" in url)

    def rastrear(self, url):
        links = [url]
        for link in tqdm(links, unit=" links",
                         desc=" Rastreando e categorizando os links",
                         leave=False):
            try:
                r = self.session.get(link)
                pageLinks = r.html.absolute_links
            except Exception:
                self.erroLink.append(link)
            else:
                for pageLink in pageLinks:
                    if (self.url_base(self.url) in pageLink
                            and self.valida_url(pageLink)):
                        if pageLink not in links and link not in self.erroLink:
                            links.append(pageLink)
        self.linksConfirmados["Todos"] = links.copy()
        links.clear()

    def valida_404(self, urls):
        # Iterate over a copy: entries are removed from the original list.
        for url in tqdm(list(urls), unit=" links",
                        desc=" Verificando se há links levando para página 404",
                        leave=False):
            try:
                location = self.session.head(url).headers["Location"]
            except Exception:
                continue
            else:
                if "/404" in location:
                    self.links404.append(url)
                    self.linksConfirmados["Todos"].remove(url)

    def url_base(self, limpaUrl, mpitemporario=True):
        limpaUrl = limpaUrl.split("//")
        limpaUrl = limpaUrl[1].split("/")
        return limpaUrl[0] if mpitemporario else [x for x in limpaUrl if x][-1]
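# Usage sketch for Links: the caller owns the error lists and reads the crawl
# result from the links_site property. The site URL is a hypothetical
# placeholder, and `rastreador_links` / `arquivo` must exist at module level
# for this to run.
links404, erros = [], []
site = Links("https://example.com/", links404, erros, RastrearLinks=True)
resultado = site.links_site
print(len(resultado["Todos"]), "links,", len(links404), "levando a 404")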
import hashlib
import os
import random
import shutil
import string
import sys
from pathlib import Path

from html2text import HTML2Text
from requests_html import HTMLSession
from slugify import slugify

# HASH_BUF_SIZE, OLD_BASE_DIR and DOWNLOADS_DIR are module-level constants
# expected to be defined elsewhere in this project.


class MoodleDL:
    def __init__(self, base_url='https://campus.exactas.uba.ar/'):
        self._session = HTMLSession()
        self._base_url = base_url
        self._processed_urls = set()

    def head(self, url, *args, **kwargs):
        if not url.startswith('http'):
            url = self._base_url + url
        return self._session.head(url, *args, **kwargs)

    def get(self, url, *args, **kwargs):
        if not url.startswith('http'):
            url = self._base_url + url
        return self._session.get(url, *args, **kwargs)

    def post(self, url, *args, **kwargs):
        if not url.startswith('http'):
            url = self._base_url + url
        return self._session.post(url, *args, **kwargs)

    def normalize_etag(self, etag):
        # Strip weak-validator prefixes and surrounding quotes.
        if etag is None:
            return None
        if etag.startswith('W/"') and etag.endswith('"'):
            return etag[3:-1]
        if etag.startswith('"') and etag.endswith('"'):
            return etag[1:-1]
        return etag

    def etag_sha1_matches(self, url, filename):
        # Assumes the server's ETag is the SHA1 of the file.
        res = self.head(url, allow_redirects=True)
        etag = self.normalize_etag(res.headers.get('ETag'))
        if not etag:
            for r in res.history:
                etag = r.headers.get('ETag')
                if etag:
                    break
            else:
                print('No ETag on headers', filename, url)
                return False
        if not Path(filename).exists():
            # print('File not previously downloaded', filename, url)
            return False
        sha1 = hashlib.sha1()
        with open(filename, 'rb') as f:
            while True:
                data = f.read(HASH_BUF_SIZE)
                if not data:
                    break
                sha1.update(data)
        digest = sha1.hexdigest()
        if digest != etag:
            print('Digest and ETag mismatch', filename, url)
            print(digest, etag)
            return False
        return True

    def download_file(self, url, name, basedir):
        if name is not None:
            filename = self.path(name, 'files_' + basedir)
            old_filename = os.path.join(OLD_BASE_DIR, filename)
            # Reuse the previous download when its hash matches the ETag.
            if self.etag_sha1_matches(url, old_filename):
                os.rename(old_filename, filename)
                return
        res = self.get(url)
        data = res.content
        if name is None:
            content_disp = res.headers.get('Content-Disposition')
            if content_disp:
                if content_disp.startswith("attachment; filename="):
                    cdname = content_disp[21:]
                    if cdname[0] == cdname[-1] == '"':
                        cdname = cdname[1:-1]
                    name = cdname
        if name is None:
            # TODO: this should be derived from some other information instead
            name = ''.join(random.sample(string.ascii_lowercase, 8))
        filename = self.path(name, 'files_' + basedir)
        with open(filename, 'wb') as f:
            f.write(data)

    def login(self, username, password):
        return self.post('login/index.php', data={
            'action': 'login',
            'username': username,
            'password': password,
        })

    def agree_policy(self, res):
        return self.post('user/policy.php', data={
            'sesskey': res.html.find(
                '#region-main form input[name=sesskey]',
                first=True).attrs['value'],
            'agree': '1',
        })

    def fetch_course(self, course_name, course_id):
        self._course_id = course_id
        self._course_name = course_name
        self.rename_old()
        # get course main page
        res = self.get('course/view.php?id=%s' % course_id)
        # handle policy agreement if the campus redirects to it
        if 'policy' in res.url:
            res = self.agree_policy(res)
        self.parse_course(res)

    def parse_course(self, res):
        topics = res.html.find('ul.topics > li.section')
        if len(topics) == 1:
            self.recurse_in_tabs(res)
        else:
            raise NotImplementedError

    def base_path(self):
        return Path(DOWNLOADS_DIR) / slugify(self._course_name)

    def path(self, filename, *dir_parts):
        path = os.path.join(self.base_path().resolve(), *dir_parts)
        os.makedirs(path, exist_ok=True)
        return os.path.join(path, filename)

    def rename_old(self):
        path = self.base_path().resolve()
        old_path = os.path.join(OLD_BASE_DIR, path)
        if os.path.isdir(path):
            if os.path.isdir(old_path):
                shutil.rmtree(old_path)
            os.makedirs(old_path, exist_ok=True)
            os.rename(path, old_path)

    def recurse_in_tabs(self, res):
        if res.url in self._processed_urls:
            return
        self._processed_urls.add(res.url)
        for a in res.html.find('.nav-tabs li a'):
            href = a.attrs.get('href')
            if href and href not in self._processed_urls:
                # Mark the tab's own URL as processed before recursing.
                self._processed_urls.add(href)
                newres = self._session.get(href)
                self.recurse_in_tabs(newres)
        if res.html.find('.errormessage'):
            return
        self.parse_section(res)

    def parse_content(self, res, title):
        content = res.html.find('#region-main .content', first=True)
        if content is None:
            content = res.html.find('#region-main [role="main"]', first=True)
        extra = []
        for iframe in content.find('iframe'):
            src = iframe.attrs.get('src')
            if not src:
                continue
            extra.append("- iframe: URL=" + src)
        h = HTML2Text(baseurl='')
        h.ul_item_mark = '-'
        md_content = h.handle(content.html)
        if extra:
            md_extra_content = '\n\n'.join(extra)
            md_content += md_extra_content
        if md_content.strip() != '':
            with open(self.path(slugify(title) + '.md'), 'w') as f:
                f.write('# ' + title + '\n([fuente](' + res.url + '))\n---\n')
                f.write(md_content)
        return content

    def parse_section(self, res):
        title = res.html.find('.breadcrumb li:last-child span a span',
                              first=True).text
        content = self.parse_content(res, title)
        for a in content.find('a'):
            href = a.attrs.get('href')
            if not href:
                continue
            section_prefix = (
                "https://campus.exactas.uba.ar/course/view.php?id={}&section="
                .format(self._course_id))
            if '/mod/resource' in href:
                self.fetch_resource(href, slugify(title))
            elif '/mod/forum' in href:
                pass  # ignoring forum
            elif '/mod/url' in href:
                self.fetch_shortened_url(href, a.text)
            elif '/mod/page' in href:
                self.fetch_page_resource(href)
            elif href.startswith(section_prefix):
                self.fetch_section(href)
            else:
                print("unhandled resource", href, title, file=sys.stderr)

    def fetch_page_resource(self, url):
        if url in self._processed_urls:
            return
        self._processed_urls.add(url)
        res = self.get(url)
        self.parse_page_resource(res)

    def parse_page_resource(self, res):
        title = res.html.find('.breadcrumb li:last-child span a span',
                              first=True).text
        self.parse_content(res, title)

    def fetch_section(self, url):
        """Fetch a section from a URL that should look like
        /course/view.php?id={}&section={}, then call parse_section.
        """
        if url in self._processed_urls:
            return
        self._processed_urls.add(url)
        res = self.get(url)
        self.parse_section(res)

    def fetch_resource(self, url, basedir):
        res = self.get(url)

        def resource_url_name():
            content_disp = res.headers.get('Content-Disposition')
            if content_disp:
                if content_disp.startswith("inline; filename="):
                    filename = content_disp[17:]
                    if filename[0] == filename[-1] == '"':
                        filename = filename[1:-1]
                    return url, filename
            # try 'regular' moodle resource download page
            a = res.html.find('object a', first=True)
            if a:
                dl_url = href = a.attrs['href']
                dl_name = href.split('/')[-1]
                return dl_url, dl_name
            # try resourceimage page
            img = res.html.find('img.resourceimage', first=True)
            if img:
                dl_url = href = img.attrs['src']
                dl_name = href.split('/')[-1]
                return dl_url, dl_name
            # try raw download
            return url, None

        dl_url, dl_name = resource_url_name()
        if not dl_url:
            return
        self.download_file(dl_url, dl_name, basedir)

    def fetch_shortened_url(self, url, text):
        """Fetch a URL that's behind a "shortened" URL that looks like
        /mod/url/view.php?id={}, then store the destination URL.
        """
        if url in self._processed_urls:
            return
        url_id = int(url.split('/mod/url/view.php?id=')[-1])
        self._processed_urls.add(url)
        res = self.get(url)
        dest = res.url
        workaround = res.html.find('.urlworkaround', first=True)
        if workaround:
            dest = workaround.find("a", first=True).attrs['href']
        path = self.base_path() / "urls" / str(url_id)
        path.parent.mkdir(parents=True, exist_ok=True)
        if text.endswith("URL"):
            text = text[:-3]
        path.write_text('# {}\nURL="{}"'.format(text, dest))
from requests_html import HTMLSession
from tqdm import tqdm


class Links:
    def __init__(self, log):
        self.session = HTMLSession()
        self.log = log

    def links_site(self, url):
        self.linksConfirmados = []
        self.rastrear(url)
        self.valida_404(self.linksConfirmados)
        return self.linksConfirmados

    def valida_url(self, url):
        # Reject query strings, anchors, image files and non-HTTP schemes.
        lowered = url.lower()
        return not ("?" in url or "#" in url or ".jpg" in lowered
                    or ".jpeg" in lowered or ".png" in lowered
                    or "tel:" in url or "mailto:" in url)

    def rastrear(self, url):
        links = [url]
        for link in tqdm(links, unit=" links",
                         desc=" Rastreando e categorizando os links",
                         leave=False):
            try:
                r = self.session.get(link)
                links_pagina = r.html.absolute_links
            except Exception:
                self.log.append(f"{link} - ERRO: link não rastreado")
            else:
                for link_pagina in links_pagina:
                    if (self.url_base(url) in link_pagina
                            and self.valida_url(link_pagina)):
                        if link_pagina not in links:
                            links.append(link_pagina)
        self.linksConfirmados = links.copy()

    def valida_404(self, links):
        # Iterate over a copy so removals don't skip elements.
        for link in tqdm(list(links), unit=" links",
                         desc=" Verificando se há links levando para página 404",
                         leave=False):
            try:
                location = self.session.head(link).headers["Location"]
            except Exception:
                continue
            else:
                if not location.endswith("/404"):
                    links.remove(link)

    def url_base(self, url, mpitemporario=False):
        url = url.split("//")
        url = url[1].split("/")
        url = url[0] if mpitemporario else [x for x in url if x][-1]
        return url.replace('www.', '')
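# Usage sketch for this Links variant: it keeps its own log list and returns
# the surviving links directly (the URL is a hypothetical placeholder).
log = []
crawler = Links(log)
restantes = crawler.links_site("https://example.com/")
print(len(restantes), "links restantes,", len(log), "erros")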
import os
import re

from requests_html import HTMLSession


class Assets:
    def __init__(self, url_base, log):
        self.session = HTMLSession()
        self.log = log
        self.url_base = url_base
        self.raiz = url_base.split('/')[2].replace('www.', '')

    # TODO: validate that the status code is 200 before downloading the file
    # (pending).

    def imagens(self, url):
        r = self.session.get(url)
        imagens = r.html.xpath('//img/@src')
        for img in imagens:
            try:
                # Strip thumbnail/resizer wrappers to reach the real image.
                img_thumbs = r'(h.*)?inc/scripts/thumbs\.php\?w=(.*?)&(amp;)?h=(.*?)&(amp;)?imagem='
                img_tim = r'(h.*)?tim\.php\?src='
                img = re.sub(f'({img_thumbs}|{img_tim})', '', img)
                if "?" in img:
                    img = img.split('?')[0]
                if "&" in img:
                    img = img.split('&')[0]
                img_local = img.replace(self.url_base, '')
                if not os.path.isfile(img_local):
                    diretorio = img_local.split('/')[:-1]
                    if not os.path.isdir(f"{self.raiz}/{'/'.join(diretorio)}"):
                        os.makedirs(f"{self.raiz}/{'/'.join(diretorio)}")
                    with open(f"{self.raiz}/{img_local}", "wb") as arquivo:
                        arquivo.write(self.session.get(img).content)
            except Exception:
                self.log.append(f'Não foi possível baixar a imagem: {img}')

    def file_head(self, url):
        r = self.session.get(url)
        head_links = r.html.xpath('//link/@href | //script/@src')
        links_js = re.findall(r'script\([\'\"](.*?)[\'\"]\)\.wait',
                              r.html.html)
        links_in_css = re.findall(r'url\([\'"]?(.*?)[\'"]?\)', r.html.html)
        scanner_file_css = [x for x in head_links if ".css" in x]
        for file in scanner_file_css:
            try:
                if "http" not in file:
                    file = f"{self.url_base}{file}"
                css = self.session.get(file)
                path_in_css = re.findall(r'url\([\'"]?(.*?)[\'"]?\)',
                                         css.html.html)
                path_in_css = [re.sub(r'<\?=.*?\?>', '', x)
                               for x in path_in_css]
                head_links.extend(path_in_css)
            except Exception:
                self.log.append(f"Não foi possível baixar o arquivo {file} ")
        head_links.extend(links_in_css)
        head_links.extend(links_js)
        # Drop external links and flag icons; keep only same-site assets.
        head_links = [x for x in head_links
                      if not ('http' in x and self.raiz not in x)]
        if url in head_links:
            head_links.remove(url)
        head_links = [x for x in head_links if "flags/" not in x]
        for head_link in head_links:
            try:
                head_link = head_link.replace('../', '')
                local = re.sub(f'.*?{self.raiz}/', '', head_link)
                if "?" in local:
                    local = local.split('?')[0]
                if not os.path.isfile(local) and len(local) > 0:
                    diretorio = local.split('/')[:-1]
                    if not os.path.isdir(f"{self.raiz}/{'/'.join(diretorio)}"):
                        os.makedirs(f"{self.raiz}/{'/'.join(diretorio)}")
                    if "http" not in head_link:
                        head_link = f"{self.url_base}{head_link}"
                    if not self.is_404(f'{self.url_base}{local}'):
                        with open(f"{self.raiz}/{local}", "wb") as arquivo:
                            arquivo.write(self.session.get(head_link).content)
            except Exception:
                self.log.append(
                    f"Não foi possível baixar o arquivo {head_link} ")

    def download_file(self, url):
        try:
            name_page = (url.replace(self.url_base, '')
                         if url != self.url_base else "index")
            if "?" in name_page:
                name_page = name_page.split('?')[0]
            if not os.path.isfile(name_page) and len(name_page) > 0:
                diretorio = name_page.split('/')[:-1]
                if not os.path.isdir(f"{self.raiz}/{'/'.join(diretorio)}"):
                    os.makedirs(f"{self.raiz}/{'/'.join(diretorio)}")
                r = self.session.get(url)
                html = r.html.html
                url_regex = f"(http://|https://)?(www.)?{self.raiz}/"
                html = re.sub(r'<base href=".*?">', '', html)
                img_thumbs = r'(h.*)?inc/scripts/thumbs\.php\?w=(.*?)&(amp;)?h=(.*?)&(amp;)?imagem='
                img_tim = r'(h.*)?tim\.php\?src='
                html = re.sub(f'({img_thumbs}|{img_tim})', '', html)
                html = re.sub(r'<img(.*?)(src ?= ?)[\'\"](.*?)(&.*?)[\'\"]',
                              r'<img\1\2"\3"', html)
                html = html.replace('../', '{{ path }}')
                html = re.sub(f'href="{url_regex}"',
                              'href="{{ path }}index.html"', html)
                # \4 is the path captured after url_regex (which itself
                # contributes groups 2 and 3).
                html = re.sub(f'<a(.*?)href="{url_regex}(.*?)"',
                              r'<a\1href="' + '{{ path }}' + r'\4.html"',
                              html)
                html = re.sub(r'href="(.*?\..*?)\.html"', r'href="\1"', html)
                html = html.replace(f'{self.url_base}', '{{ path }}')
                html = html.replace('{{ path }}', '../' * len(diretorio))
                with open(f"{self.raiz}/{name_page}.html", "w",
                          encoding="utf-8") as arquivo:
                    arquivo.write(str(html))
        except Exception:
            self.log.append(f"Não foi possível baixar o arquivo {name_page}")

    def is_404(self, link):
        try:
            location = self.session.head(link).headers["Location"]
        except Exception:
            return False
        else:
            return location.endswith("/404")
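# Usage sketch for Assets: mirror one page plus the images and head assets it
# references into a local folder named after the domain (the URLs are
# hypothetical placeholders).
log = []
assets = Assets("https://example.com/", log)
assets.download_file("https://example.com/contato")
assets.imagens("https://example.com/contato")
assets.file_head("https://example.com/contato")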
from requests_html import HTMLSession


class Imagens:
    def __init__(self, erroImgQuebrada, erroTamanho, erroTitleAlt,
                 erroValidador):
        self.session = HTMLSession()
        self.erroImgQuebrada = erroImgQuebrada
        self.erroTamanho = erroTamanho
        self.erroTitleAlt = erroTitleAlt
        self.erroValidador = erroValidador

    def verifica(self, pagina, imagens):
        try:
            for imagem in imagens:
                # Normalize src values that carry a prefix before "http".
                if imagem.attrs["src"][0:4].lower() != "http":
                    i = imagem.attrs["src"].find("http")
                    imagem.attrs["src"] = imagem.attrs["src"][i:]
                if self.session.head(imagem.attrs["src"]).status_code != 200:
                    self.erroImgQuebrada.append(
                        f"{pagina}: {imagem.attrs['src']}")
                else:
                    self.tamanho_imagem(imagem)
                    self.alt_title(imagem, pagina)
            repetida = self.duplicado(self.erroTamanho)
            if repetida:
                self.erroTamanho.remove(repetida)
        except Exception:
            self.erroValidador.append(pagina)

    def tamanho_imagem(self, imagem):
        if "&imagem=" not in imagem.attrs["src"]:
            tamanho = int(self.session.head(
                imagem.attrs["src"]).headers["Content-Length"]) / 1024
            if round(tamanho) > 200:
                src = imagem.attrs["src"]
                # Only report each oversized image once.
                if src.lower() not in (x.lower() for x in self.erroTamanho):
                    self.erroTamanho.append(src)

    def alt_title(self, imagem, pagina):
        try:
            if "escrev" in imagem.attrs["alt"].lower():
                self.erroTitleAlt.append(
                    f"{pagina} - src='{imagem.attrs['src']}' ALT com 'ESCREVA AQUI'")
            elif "exemplo de mpi" in imagem.attrs["alt"].lower():
                self.erroTitleAlt.append(
                    f"{pagina} - src='{imagem.attrs['src']}' ALT com 'Exemplo de MPI'")
        except KeyError:
            self.erroTitleAlt.append(
                f"{pagina} - src='{imagem.attrs['src']}' Imagem sem ALT")
        try:
            if "escrev" in imagem.attrs["title"].lower():
                self.erroTitleAlt.append(
                    f"{pagina} - src='{imagem.attrs['src']}' com TITLE 'ESCREVA AQUI'")
            elif "exemplo de mpi" in imagem.attrs["title"].lower():
                self.erroTitleAlt.append(
                    f"{pagina} - src='{imagem.attrs['src']}' com TITLE 'Exemplo de MPI'")
        except KeyError:
            self.erroTitleAlt.append(
                f"{pagina} - src='{imagem.attrs['src']}' - Imagem sem TITLE")

    def duplicado(self, lista):
        contagem = dict()
        for indice in lista:
            item = indice.strip()
            contagem[item] = contagem.get(item, 0) + 1
        for maxRepeticao in contagem:
            if contagem[maxRepeticao] > 1:
                return maxRepeticao
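# Usage sketch for Imagens: fetch a page, collect its <img> elements with
# requests_html and run the checks; results accumulate in the shared lists
# (the page URL is a hypothetical placeholder).
quebradas, pesadas, sem_texto, falhas = [], [], [], []
validador = Imagens(quebradas, pesadas, sem_texto, falhas)
pagina = "https://example.com/servicos"
resposta = HTMLSession().get(pagina)
validador.verifica(pagina, resposta.html.find("img"))
print(len(quebradas), "imagens quebradas,", len(pesadas), "acima de 200 KB")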
import multiprocessing
import time

import lxml.etree
from requests import adapters, exceptions
from requests_html import HTMLSession, MaxRetries

# color_logging is a project-level logging helper defined elsewhere.


class Worker(multiprocessing.Process):
    def __init__(self, unvisited_urls_queue, fetched_urls_queue, result_queue,
                 counter, config):
        multiprocessing.Process.__init__(self)
        self.unvisited_urls_queue = unvisited_urls_queue
        self.fetched_urls_queue = fetched_urls_queue
        self.result_queue = result_queue
        self.counter = counter
        self.config = config
        self.kwargs = config["kwargs"]
        self.session = HTMLSession()
        # Enlarge the connection pool so many URLs can be checked in parallel.
        a = adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.session.mount("http://", a)
        self.session.mount("https://", a)

    def get_url_type(self, url, resp):
        for include_snippet in self.config["include"]:
            if include_snippet in url:
                content_type = resp.headers.get('Content-Type', None)
                if content_type and "text/html" in content_type:
                    return 'recursive'
                return 'static'
        return "external"

    def check_url_info(self, url):
        for exclude_snippet in self.config["exclude"]:
            if exclude_snippet in url:
                return (None, "exclude")
        try:
            resp = self.session.head(url, **self.kwargs)
            status_code = resp.status_code
            url_type = self.get_url_type(url, resp)
        except exceptions.ConnectTimeout as ex:
            color_logging(f"{url}: {str(ex)}", 'WARNING')
            status_code = "ConnectTimeout"
            url_type = None
        except exceptions.ConnectionError as ex:
            color_logging(f"{url}: {str(ex)}", 'WARNING')
            status_code = "ConnectionError"
            url_type = None
        return (status_code, url_type)

    def get_hyper_links(self, url):
        hyper_links = set()
        try:
            resp = self.session.get(url, **self.kwargs)
            status_code = resp.status_code
        except exceptions.ConnectionError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
            # No response object to parse; bail out early.
            return ("ConnectionError", hyper_links)
        try:
            # Render JavaScript so dynamically inserted links are found too.
            resp.html.render(sleep=1, timeout=30)
            hyper_links = resp.html.absolute_links
        except lxml.etree.ParserError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        except UnicodeDecodeError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        except MaxRetries as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        return (status_code, hyper_links)

    def run(self):
        while True:
            unvisited_url = self.unvisited_urls_queue.get()
            if unvisited_url is None:
                # Poison pill means shutdown.
                color_logging(f'{self.name}: Exiting')
                self.unvisited_urls_queue.task_done()
                break
            start_time = time.time()
            status_code, url_type = self.check_url_info(unvisited_url)
            method = "HEAD"
            if url_type == "exclude":
                color_logging(f"skip url: {unvisited_url}", color="blue")
                self.unvisited_urls_queue.task_done()
                continue
            if url_type in ['static', 'external']:
                hyper_links = set()
            elif url_type == 'recursive':
                method = "GET & Render"
                status_code, hyper_links = self.get_hyper_links(unvisited_url)
            else:
                # url_type is None
                # TODO: raise exception
                hyper_links = set()
            duration_time = time.time() - start_time
            result = (unvisited_url, status_code, duration_time, hyper_links)
            self.result_queue.put(result)
            for link in hyper_links:
                self.fetched_urls_queue.put(link)
            self.unvisited_urls_queue.task_done()
            self.counter.value += 1
            color_logging(
                f"index: {self.counter.value}, {method} {unvisited_url}, "
                f"status_code: {status_code}, duration_time: {duration_time}, "
                f"worker: {self.name}", color="white")
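# Wiring sketch for Worker: one JoinableQueue feeds unvisited URLs, workers
# push results and newly discovered links back, and one poison pill (None)
# per worker shuts the pool down. The seed URL and config values are
# hypothetical placeholders.
if __name__ == '__main__':
    unvisited = multiprocessing.JoinableQueue()
    fetched = multiprocessing.Queue()
    results = multiprocessing.Queue()
    counter = multiprocessing.Value('i', 0)
    config = {
        "include": ["example.com"],   # crawl these hosts recursively
        "exclude": ["logout"],        # never touch these URLs
        "kwargs": {"timeout": 10},
    }
    workers = [Worker(unvisited, fetched, results, counter, config)
               for _ in range(4)]
    for w in workers:
        w.start()
    unvisited.put("https://example.com/")
    # A coordinator would move links from `fetched` into `unvisited`,
    # deduplicating as it goes, before sending the poison pills:
    for _ in workers:
        unvisited.put(None)
    unvisited.join()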