Beispiel #1
0
def fetch_gracefully(url_byte, filename):
    """Fetch ``url_byte`` into ``filename``, retrying when the URL changes.

    Simplified adaptation of spiderfetch.py:get_url().

    A ``fetch.Fetcher`` is created for the given url and filename, its
    ``write_progress`` hook is replaced with a no-op, and
    ``launch_w_tries()`` is called.  Each time that call raises
    ``fetch.ChangedUrlWarning``, the fetcher's ``url`` is replaced with the
    first item produced by ``urlrewrite.rewrite_urls`` for the warning's
    ``new_url`` (rewritten against the current url) and the launch is
    attempted again.  Any other exception propagates to the caller.
    """
    getter = fetch.Fetcher(url=url_byte, filename=filename)
    getter.write_progress = lambda *args, **kw: None  # no output or logging

    while True:
        try:
            getter.launch_w_tries()
        except fetch.ChangedUrlWarning as e:
            # ``except ... as`` and the next() builtin are valid on both
            # Python 2.6+ and Python 3, unlike ``except X, e`` / ``.next()``.
            getter.url = next(urlrewrite.rewrite_urls(getter.url, [e.new_url]))
        else:
            # launch finished without a URL change: done
            break
Beispiel #2
0
def fetch_gracefully(url_byte, filename):
    """Fetch ``url_byte`` into ``filename``, retrying when the URL changes.

    Simplified adaptation of spiderfetch.py:get_url().

    A ``fetch.Fetcher`` is created for the given url and filename, its
    ``write_progress`` hook is replaced with a no-op, and
    ``launch_w_tries()`` is called.  Each time that call raises
    ``fetch.ChangedUrlWarning``, the fetcher's ``url`` is replaced with the
    first item produced by ``urlrewrite.rewrite_urls`` for the warning's
    ``new_url`` (rewritten against the current url) and the launch is
    attempted again.  Any other exception propagates to the caller.
    """
    getter = fetch.Fetcher(url=url_byte, filename=filename)
    getter.write_progress = lambda *args, **kw: None  # no output or logging

    while True:
        try:
            getter.launch_w_tries()
        except fetch.ChangedUrlWarning as e:
            # ``except ... as`` and the next() builtin are valid on both
            # Python 2.6+ and Python 3, unlike ``except X, e`` / ``.next()``.
            getter.url = next(urlrewrite.rewrite_urls(getter.url, [e.new_url]))
        else:
            # launch finished without a URL change: done
            break
Beispiel #3
0
def find_urls_in_page(web, txt_byte, url_u, url_byte):
    """Collect links from a page that are not yet in ``web`` and add them.

    Links are extracted from ``txt_byte`` with ``spider.findall`` (unwrapped
    by ``spider.unbox_it_to_ss``), de-duplicated, sorted, and rewritten
    against ``url_byte`` via ``urlrewrite.rewrite_urls``.  A rewritten link
    is kept when it matches the regex from ``get_regex_filter(url_u)``,
    differs from ``url_byte`` and is not already in ``web``; every kept link
    is registered with ``web.add_url(link, [])``.

    Returns the list of kept links, or ``web.urls()`` if none were kept.
    """
    extracted = spider.unbox_it_to_ss(spider.findall(txt_byte))
    unique_links = sorted(set(extracted))

    pattern = get_regex_filter(url_u)

    fresh_links = []
    for link in urlrewrite.rewrite_urls(url_byte, unique_links):
        if not re.match(pattern, link):
            continue
        if url_byte == link:
            # a page linking to itself is not a candidate
            continue
        if link in web:
            continue
        web.add_url(link, [])
        fresh_links.append(link)

    # if no candidate links are found, fall back on visited urls
    if not fresh_links:
        return web.urls()
    return fresh_links
Beispiel #4
0
def find_urls_in_page(web, txt_byte, url_u, url_byte):
    """Collect links from a page that are not yet in ``web`` and add them.

    Links are extracted from ``txt_byte`` with ``spider.findall`` (unwrapped
    by ``spider.unbox_it_to_ss``), de-duplicated, sorted, and rewritten
    against ``url_byte`` via ``urlrewrite.rewrite_urls``.  A rewritten link
    is kept when it matches the regex from ``get_regex_filter(url_u)``,
    differs from ``url_byte`` and is not already in ``web``; every kept link
    is registered with ``web.add_url(link, [])``.

    Returns the list of kept links, or ``web.urls()`` if none were kept.
    """
    extracted = spider.unbox_it_to_ss(spider.findall(txt_byte))
    unique_links = sorted(set(extracted))

    pattern = get_regex_filter(url_u)

    fresh_links = []
    for link in urlrewrite.rewrite_urls(url_byte, unique_links):
        if not re.match(pattern, link):
            continue
        if url_byte == link:
            # a page linking to itself is not a candidate
            continue
        if link in web:
            continue
        web.add_url(link, [])
        fresh_links.append(link)

    # if no candidate links are found, fall back on visited urls
    if not fresh_links:
        return web.urls()
    return fresh_links