Example #1
import re


def find_urls_in_page(web, txt_byte, url_u, url_byte):
    # spider, urlrewrite and the web graph object come from the surrounding
    # crawler project; only re is a standard-library import.
    # Extract every url found in the page and deduplicate.
    urls_byte = []
    for u_b in spider.unbox_it_to_ss(spider.findall(txt_byte)):
        urls_byte.append(u_b)
    urls_byte = sorted(set(urls_byte))

    filter_regex = get_regex_filter(url_u)

    # Keep urls that pass the filter, differ from the current url and have
    # not been seen before; register each new url in the web graph.
    candidates_byte = []
    for u_b in urlrewrite.rewrite_urls(url_byte, urls_byte):
        if re.match(filter_regex, u_b) and url_byte != u_b:
            if u_b not in web:
                web.add_url(u_b, [])
                candidates_byte.append(u_b)

    # if no candidate links are found, fall back on visited urls
    if not candidates_byte:
        candidates_byte = web.urls()

    return candidates_byte
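
The example calls get_regex_filter(url_u) without showing it. Below is a minimal sketch of such a filter, assuming its only job is to keep the crawl on the same host as the current page; the helper is hypothetical and not the project's actual implementation.

import re
import urllib.parse


def get_regex_filter(url_u):
    # Hypothetical helper: build a regex that only matches urls on the same
    # host as the current page. The real project's filter may differ.
    host = urllib.parse.urlsplit(url_u).netloc
    return r'^https?://%s/' % re.escape(host)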
Example #2
    # Fragment: the tail of the function that processes one page. Fetch the
    # page, collect candidate urls from it and pick the next url to visit.
    txt_byte = get_page(url_byte)

    candidates_byte = find_urls_in_page(web, txt_byte, url_u, url_byte)
    encoding = decoder.detect_encoding(txt_byte)
    chosen_u = pick_url(candidates_byte, encoding=encoding)

    return chosen_u


if __name__ == '__main__':
    # Seed urls; only the last assignment takes effect, the others are
    # alternative starting points.
    url_byte = 'http://en.wikipedia.org/wiki/Main_Page'
    url_byte = 'http://ar.wikipedia.org/wiki/الصفحة_الرئيسية'
    url_byte = 'http://pt.wikipedia.org/wiki/Casa_da_Cascata'
    url_byte = 'http://it.wikipedia.org/wiki/Special:Random'

    web = web.Web()
    web.add_url(url_byte, [])

    url_u = decoder.decode(url_byte, 'utf-8')
    depth = -1
    while depth != 0:  # starting from -1, depth never reaches 0, so the crawl runs forever
        depth -= 1
        try:
            url_u = find_next(url_u, web, handler=url_handler)
        except Exception:
            io.output("Recovering from exception:")
            io.output(traceback.format_exc())
            url_u = pick_url(web.urls())

        pause()  # throttle between requests so we don't hammer the server
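
pause() is called after every request but is not defined in the excerpt. A minimal stand-in, assuming it only needs to sleep briefly between requests, could look like the following; it is a sketch, not the project's actual code.

import random
import time


def pause(low=1.0, high=3.0):
    # Stand-in throttle: sleep a short random interval between requests so
    # the crawler does not hammer the remote server.
    time.sleep(random.uniform(low, high))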