Ejemplo n.º 1
0
def getBrokenlinks(url):
    """Count the links on a page that are reported as broken.

    Every ``<a href=...>`` element in the page's parsed markup is resolved
    to an absolute URL; each URL accepted by ``validators.url`` is checked
    on its own ``MyThread`` worker, and the workers are then joined.

    Args:
        url: Page wrapper object providing ``getsoup()`` (parsed markup
            exposing ``find_all``) and ``geturl()`` (the base address used
            to resolve relative links).

    Returns:
        int: Number of workers whose ``getResult()`` was truthy.

    Raises:
        WebcredError: If the page markup cannot be obtained.
    """
    # Function-scope import so nothing new is required at file level; the
    # fallback keeps the function usable on Python 2.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    try:
        soup = url.getsoup()
    except WebcredError:
        # Propagate the original error untouched (keeps its traceback).
        raise
    except Exception:
        # Any other failure while fetching/parsing means the page itself
        # is unusable. ``Exception`` (not a bare except) lets
        # KeyboardInterrupt/SystemExit through.
        raise WebcredError('Url is broken')

    threads = []
    for link in soup.find_all('a', href=True):
        uri = link.get('href')

        # TODO should it include inner links as well?
        if not uri.startswith(('http://', 'https://')):
            # Resolve relative hrefs against the page address. Plain
            # concatenation mangles values such as "/about", "../x" or
            # "//host/x" whenever the base already has a path.
            uri = urljoin(url.geturl(), uri)

        if validators.url(uri):
            t = MyThread(Method='funcBrokenllinks',
                         Name='brokenlinks',
                         Url=uri)
            t.start()
            threads.append(t)

    broken_links = 0
    for t in threads:
        t.join()
        if t.getResult():
            broken_links += 1

    return broken_links
Ejemplo n.º 2
0
def getImgratio(url):
    """Return the share of a page's total size that is text.

    The size of every ``<img src=...>`` on the page is fetched on a
    ``MyThread`` worker and summed; the result is
    ``text_size / (text_size + total_img_size)``.

    Args:
        url: Page wrapper object providing ``getsize()``, ``getsoup()``
            (parsed markup exposing ``find_all``) and ``geturl()`` (the
            base address used to resolve relative image sources).

    Returns:
        float: The text-to-total size ratio. If ``url.getsize()`` raises
        ``WebcredError``, that error's ``message`` is returned instead.

    Raises:
        WebcredError: If the ratio cannot be computed, including the case
            where the combined size is zero.
    """
    # Function-scope import so nothing new is required at file level; the
    # fallback keeps the function usable on Python 2.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    try:
        text_size = url.getsize()
    except WebcredError as e:
        # NOTE(review): this returns the error message rather than raising,
        # unlike the ratio failure below; kept because callers may rely on
        # it -- confirm before changing.
        return e.message

    soup = url.getsoup()

    threads = []
    for link in soup.find_all('img', src=True):
        uri = link.get('src', None)
        if not uri.startswith(('http://', 'https://')):
            # Resolve relative sources against the page address. Plain
            # concatenation mangles values such as "/img/a.png" or
            # "//cdn/a.png" whenever the base already has a path.
            uri = urljoin(url.geturl(), uri)

        if validators.url(uri):
            try:
                image = Urlattributes(uri)
                t = MyThread(Method='funcImgratio', Name='Imgratio',
                             Url=image)
                t.start()
                threads.append(t)
            except WebcredError:
                # even if particular image is not accessible, we don't mind it
                pass

    total_img_size = 0
    for t in threads:
        t.join()
        t.freemem()
        size = t.getResult()
        # Workers that failed do not yield an int; skip those results.
        if isinstance(size, int):
            total_img_size += size

    try:
        total_size = total_img_size + text_size
        ratio = float(text_size) / total_size
    except (ValueError, ZeroDivisionError):
        # A zero combined size raises ZeroDivisionError, not ValueError;
        # report both through the module's own error type.
        raise WebcredError('Error in fetching images')

    return ratio