Ejemplo n.º 1
0
def qualify_urls(ref_url, urls, rule, newqueue, wb):
    """Classify every url against the rule's masks and queue the new ones.

    For each url the "dump", "fetch" and "spider" patterns of `rule` are
    applied; the spider pattern only counts when the rule's "host_filter"
    also accepts the url.  Urls not yet in `wb` are written out (dump)
    and/or appended to `newqueue` as {"url": ..., "mode": ...} records
    (fetch / spider).  Any url matched by at least one pattern is then
    registered in `wb` under `ref_url`.

    Returns the (newqueue, wb) pair; both objects are modified in place.
    """
    for url in urls:
        # evaluate the masks in a fixed order; the host filter is consulted
        # only after the spider mask has matched
        wants_dump = bool(recipe.apply_mask(rule.get("dump"), url))
        wants_fetch = bool(recipe.apply_mask(rule.get("fetch"), url))
        wants_spider = bool(
            recipe.apply_mask(rule.get("spider"), url)
            and recipe.apply_hostfilter(rule.get("host_filter"), url))

        # only urls that are not in the web yet are dumped or queued
        if url not in wb:
            if wants_dump:
                io.write_out("%s\n" % url)

            if wants_fetch or wants_spider:
                entry = {"url": url}
                if wants_fetch and wants_spider:
                    entry["mode"] = fetch.Fetcher.SPIDER_FETCH
                elif wants_fetch:
                    entry["mode"] = fetch.Fetcher.FETCH
                else:
                    entry["mode"] = fetch.Fetcher.SPIDER
                newqueue.append(entry)

        # a url matched by any pattern is recorded in the web, seen or not
        if any((wants_dump, wants_fetch, wants_spider)):
            wb.add_url(ref_url, [url])

    return newqueue, wb
Ejemplo n.º 2
0
def qualify_urls(ref_url, urls, rule, newqueue, wb):
    """Sort the given urls into dump / fetch / spider buckets per `rule`.

    The rule's "dump", "fetch" and "spider" patterns are applied to each
    url (spider additionally requires the "host_filter" to accept it).
    A url that is not in `wb` yet is written out when it matches dump and
    appended to `newqueue` with the appropriate fetch mode when it matches
    fetch and/or spider.  Every url matched by some pattern is added to
    `wb` with `ref_url` as its referrer.

    Returns `newqueue` and `wb` as a tuple, after updating both in place.
    """
    for candidate in urls:
        dump_hit = True if recipe.apply_mask(rule.get("dump"), candidate) else False
        fetch_hit = True if recipe.apply_mask(rule.get("fetch"), candidate) else False

        # the host filter is only asked once the spider pattern has matched
        spider_hit = False
        if recipe.apply_mask(rule.get("spider"), candidate):
            if recipe.apply_hostfilter(rule.get("host_filter"), candidate):
                spider_hit = True

        queueable = fetch_hit or spider_hit

        # urls that are already part of the web are neither dumped nor queued
        if candidate not in wb:
            if dump_hit:
                io.write_out("%s\n" % candidate)

            if queueable:
                item = {"url": candidate}
                if not spider_hit:
                    item["mode"] = fetch.Fetcher.FETCH
                elif not fetch_hit:
                    item["mode"] = fetch.Fetcher.SPIDER
                else:
                    item["mode"] = fetch.Fetcher.SPIDER_FETCH
                newqueue.append(item)

        # register the url in the web if anything matched it
        if dump_hit or queueable:
            wb.add_url(ref_url, [candidate])

    return newqueue, wb
Ejemplo n.º 3
0
def get_url(fetcher, wb, host_filter=False):
    """Run the fetcher, following http 30x redirects until a fetch completes.

    A redirect surfaces as fetch.ChangedUrlWarning carrying a new url that
    may or may not have been seen before.  The new url is rewritten relative
    to the fetcher's current url, checked, recorded in `wb` as a reference
    from the old url, and the fetch is retried with it.

    fetcher      object providing launch_w_tries() and a writable `url`
    wb           web of known urls; must support `in` and add_ref(old, new)
    host_filter  handed to recipe.apply_hostfilter along with the new url

    Raises fetch.DuplicateUrlWarning when the redirect target is already in
    `wb`, and fetch.UrlRedirectsOffHost when the host filter rejects it.
    Returns None once launch_w_tries() finishes without a redirect.
    """
    while True:
        try:
            fetcher.launch_w_tries()
            break
        # "except ... as" and builtin next() are valid on Python 2.6+ and 3;
        # the old "except X, e" / ".next()" spellings only parse/run on 2.x
        except fetch.ChangedUrlWarning as e:
            # take the first url produced by the rewriter for the target
            url = next(urlrewrite.rewrite_urls(fetcher.url, [e.new_url]))
            if url in wb:
                raise fetch.DuplicateUrlWarning
            if not recipe.apply_hostfilter(host_filter, url):
                raise fetch.UrlRedirectsOffHost
            wb.add_ref(fetcher.url, url)
            # retry the fetch against the redirect target
            fetcher.url = url
Ejemplo n.º 4
0
def get_url(fetcher, wb, host_filter=False):
    """Fetch via `fetcher`, retrying on each http 30x redirect.

    When launch_w_tries() raises fetch.ChangedUrlWarning, its new_url is
    rewritten against the fetcher's current url.  That url may or may not
    have been seen before, so it is validated first; if accepted, `wb`
    records the old -> new reference and the loop fetches the new url.

    fetcher      object providing launch_w_tries() and a writable `url`
    wb           web of known urls; must support `in` and add_ref(old, new)
    host_filter  handed to recipe.apply_hostfilter along with the new url

    Raises fetch.DuplicateUrlWarning if the rewritten url is already in
    `wb`, and fetch.UrlRedirectsOffHost if the host filter does not accept
    it.  Returns None after a launch that raises no redirect warning.
    """
    while True:
        try:
            fetcher.launch_w_tries()
            break
        # written with "as" (not "except X, e") so the module also parses
        # on Python 3; this form is accepted from Python 2.6 onwards
        except fetch.ChangedUrlWarning as e:
            # builtin next() replaces the Python 2-only .next() method
            url = next(urlrewrite.rewrite_urls(fetcher.url, [e.new_url]))
            if url in wb:
                raise fetch.DuplicateUrlWarning
            if not recipe.apply_hostfilter(host_filter, url):
                raise fetch.UrlRedirectsOffHost
            wb.add_ref(fetcher.url, url)
            # point the fetcher at the redirect target and loop again
            fetcher.url = url