Example #1
def get_url(fetcher, wb, host_filter=False):
    """HTTP 30x redirects are followed in a loop, producing new urls that
    may or may not have been seen before."""
    while True:
        try:
            fetcher.launch_w_tries()
            break
        except fetch.ChangedUrlWarning as e:
            # Normalize the redirect target relative to the current url.
            url = next(urlrewrite.rewrite_urls(fetcher.url, [e.new_url]))
            # A target already in the workbench means a redirect cycle or
            # a page handled before: give up on this url.
            if url in wb:
                raise fetch.DuplicateUrlWarning
            # Redirects may not escape the hosts permitted by the rule.
            if not recipe.apply_hostfilter(host_filter, url):
                raise fetch.UrlRedirectsOffHost
            # Record the redirect edge, then retry with the new url.
            wb.add_ref(fetcher.url, url)
            fetcher.url = url
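
The loop's contract is easy to exercise in isolation. Below is a minimal, runnable sketch of the same redirect-following pattern; ChangedUrlWarning, StubFetcher and follow_redirects are stand-ins invented here, not the project's fetch API, and the duplicate check is reduced to a plain set.

class ChangedUrlWarning(Exception):
    """Stand-in for fetch.ChangedUrlWarning: carries the redirect target."""
    def __init__(self, new_url):
        self.new_url = new_url

class StubFetcher(object):
    """Raises one redirect, then succeeds, mimicking launch_w_tries()."""
    def __init__(self, url):
        self.url = url
        self._redirected = False

    def launch_w_tries(self):
        if not self._redirected:
            self._redirected = True
            raise ChangedUrlWarning(self.url + "/moved")
        print("fetched %s" % self.url)

def follow_redirects(fetcher, seen):
    # Same shape as get_url: loop until a fetch completes, switching to
    # the new url (and checking for duplicates) on each redirect.
    while True:
        try:
            fetcher.launch_w_tries()
            break
        except ChangedUrlWarning as e:
            if e.new_url in seen:
                raise RuntimeError("duplicate url: %s" % e.new_url)
            seen.add(e.new_url)
            fetcher.url = e.new_url

follow_redirects(StubFetcher("http://example.com/page"), set())
# -> fetched http://example.com/page/moved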
Example #2
def process_records(queue, rule, wb):
    # Assumes stdlib imports os, shutil, sys, plus the project's own fetch,
    # urlrewrite and spider modules (io here is project-local, not stdlib).
    newqueue = []
    for record in queue:
        maybesave(wb, queue)

        url = record.get("url")
        try:
            (fp, filename) = io.get_tempfile()
            f = fetch.Fetcher(mode=record.get("mode"),
                              url=url,
                              filename=filename)
            url = get_url(f, wb, host_filter=rule.get("host_filter"))
            filename = f.filename

            # Retry the fetch once if it failed with a transient error:
            # re-enqueue the record and the loop will reach it again.
            if f.error and fetch.err.is_temporal(f.error):
                if not record.get("retry"):
                    record["retry"] = True
                    queue.append(record)

            if record.get("mode") == fetch.Fetcher.SPIDER:
                # Extract links from the fetched page and normalize them.
                with open(filename, 'r') as fobj:
                    data = fobj.read()
                urls = spider.unbox_it_to_ss(spider.findall(data, url))
                urls = urlrewrite.rewrite_urls(url, urls)

                (newqueue, wb) = qualify_urls(url, urls, rule, newqueue, wb)

            if record.get("mode") == fetch.Fetcher.FETCH:
                shutil.move(filename,
                            io.safe_filename(urlrewrite.url_to_filename(url)))

        except (fetch.DuplicateUrlWarning, fetch.UrlRedirectsOffHost):
            pass
        except KeyboardInterrupt:
            # Persist the unprocessed tail of the queue so the session
            # can be resumed after the interrupt.
            q = queue[queue.index(record):]
            q.extend(newqueue)
            save_session(wb, queue=q)
            sys.exit(1)
        except Exception as exc:
            log_exc(exc, url, wb)
        finally:
            # The finally body was cut off in the source; closing the
            # temp file descriptor is the assumed cleanup here.
            os.close(fp)

    # Assumed completion: hand the newly qualified urls back to the caller.
    return newqueue
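
One detail worth calling out: the retry path works by appending the failed record to the very list the for loop is iterating over. Python's for loop walks a list by index, so items appended during iteration are still visited, and the "retry" flag guarantees each record is re-enqueued at most once. A tiny self-contained demo of that behavior (the record dict here is made up for illustration):

queue = [{"url": "http://example.com/a"}]
for record in queue:
    if not record.get("retry"):
        record["retry"] = True
        queue.append(record)   # re-enqueue: the loop will see it again
    else:
        print("retrying %s" % record["url"])
# -> retrying http://example.com/a

The same index-based iteration also explains the KeyboardInterrupt handler: slicing queue[queue.index(record):] keeps everything from the current record onward, i.e. exactly the records not yet processed.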