Python find_urls_for_xml Examples, planemo.shed.find_urls_for_xml Python Examples

Example #1

0

Show file

File: lint.py Project: stevecassidy/planemo

def lint_urls(root, lint_ctx):
    """Find referenced URLs and verify they are valid.

    ``find_urls_for_xml(root)`` returns two collections of URLs (``urls`` and
    ``docs``).  Every URL is fetched once and the outcome is reported through
    ``lint_ctx``: ``lint_ctx.info`` for a URL that could be read and
    ``lint_ctx.error`` for one that could not.  Nothing is returned.
    """
    urls, docs = find_urls_for_xml(root)

    # This is from Google Chrome 53.0.2785.143, current at time of writing:
    BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"

    def validate_url(url, lint_ctx, user_agent=None):
        """Fetch the first bytes of ``url`` and report the result.

        When ``user_agent`` is given it is sent as the ``User-Agent`` header;
        otherwise the URL is opened with the library defaults.
        """
        is_valid = True
        if user_agent:
            req = Request(url, headers={"User-Agent": user_agent})
        else:
            req = url
        try:
            handle = urlopen(req)
            try:
                # Reading a small prefix is enough to prove the resource is
                # actually being served; the full body is not needed.
                handle.read(100)
            finally:
                # Always release the connection, otherwise one socket is
                # leaked for every URL that gets checked.
                handle.close()
        except HTTPError as e:
            # HTTPError must be handled before URLError (it is a subclass).
            if e.code == 429:
                # too many requests - rate limiting is not treated as a
                # broken link, so the URL is still reported as OK below.
                pass
            else:
                is_valid = False
                lint_ctx.error("HTTP Error %s accessing %s" % (e.code, url))
        except URLError as e:
            is_valid = False
            lint_ctx.error("URL Error %s accessing %s" % (str(e), url))
        if is_valid:
            lint_ctx.info("URL OK %s" % url)

    for url in urls:
        validate_url(url, lint_ctx)
    # The second collection is requested with a browser User-Agent
    # (presumably because some documentation hosts reject other clients).
    for url in docs:
        validate_url(url, lint_ctx, BROWSER_USER_AGENT)

Example #2

0

Show file

File: lint.py Project: gregvonkuster/planemo

def lint_urls(root, lint_ctx):
    """Find referenced URLs and verify they are valid.

    The two URL collections returned by ``find_urls_for_xml(root)`` are each
    probed with a single request per URL.  Results are reported only through
    ``lint_ctx`` (``info`` on success, ``error`` on failure); the function
    itself returns nothing.
    """
    urls, docs = find_urls_for_xml(root)

    # This is from Google Chrome 53.0.2785.143, current at time of writing:
    BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"

    def validate_url(url, lint_ctx, user_agent=None):
        """Open ``url``, read a short prefix, and log whether it worked.

        ``user_agent``, when truthy, is sent as the ``User-Agent`` request
        header.
        """
        is_valid = True
        if user_agent:
            req = Request(url, headers={"User-Agent": user_agent})
        else:
            req = url
        try:
            handle = urlopen(req)
            try:
                # A 100-byte read confirms the server really delivers
                # content without downloading the whole resource.
                handle.read(100)
            finally:
                # Close explicitly so the underlying socket is not leaked.
                handle.close()
        except HTTPError as e:
            # Keep this clause first: HTTPError derives from URLError.
            if e.code == 429:
                # too many requests - being rate limited does not mean the
                # link is dead, so no error is recorded.
                pass
            else:
                is_valid = False
                lint_ctx.error("HTTP Error %s accessing %s" % (e.code, url))
        except URLError as e:
            is_valid = False
            lint_ctx.error("URL Error %s accessing %s" % (str(e), url))
        if is_valid:
            lint_ctx.info("URL OK %s" % url)

    for url in urls:
        validate_url(url, lint_ctx)
    # URLs in ``docs`` are fetched while identifying as a desktop browser.
    for url in docs:
        validate_url(url, lint_ctx, BROWSER_USER_AGENT)

Example #3

0

Show file

File: lint.py Project: pierrickrogermele/planemo

def lint_urls(root, lint_ctx):
    """Find referenced URLs and verify they are valid.

    Each URL returned by ``find_urls_for_xml(root)`` is opened with
    ``urllib2``.  A readable URL is reported via ``lint_ctx.info``; an HTTP or
    URL error is reported via ``lint_ctx.error``.  Nothing is returned.
    """
    urls = find_urls_for_xml(root)

    def validate_url(url, lint_ctx):
        """Open ``url``, read a short prefix, and report the outcome."""
        try:
            handle = urllib2.urlopen(url)
            try:
                # Only a small prefix is read: enough to prove the resource
                # is served without downloading it completely.
                handle.read(100)
            finally:
                # Release the connection even if the read fails, so that no
                # socket is leaked per checked URL.
                handle.close()
            lint_ctx.info("URL OK %s" % url)
        except urllib2.HTTPError as e:
            # HTTPError is a subclass of URLError and must be caught first.
            lint_ctx.error("HTTP Error %s accessing %s" % (e.code, url))
        except urllib2.URLError as e:
            lint_ctx.error("URL Error %s accessing %s" % (str(e), url))

    for url in urls:
        validate_url(url, lint_ctx)

Example #4

0

Show file

def lint_urls(root, lint_ctx):
    """Find referenced URLs and verify they are valid.

    ``find_urls_for_xml(root)`` returns two collections of URLs (``urls`` and
    ``docs``).  Each URL is probed once; success is reported with
    ``lint_ctx.info`` and failure with ``lint_ctx.error``.  Nothing is
    returned.
    """
    urls, docs = find_urls_for_xml(root)

    # This is from Google Chrome on macOS, current at time of writing:
    BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"

    def validate_url(url, lint_ctx, user_agent=None):
        """Probe ``url`` and report the result through ``lint_ctx``.

        HTTP(S) URLs are fetched with ``requests`` (sending ``user_agent`` as
        the ``User-Agent`` header when given); every other scheme is opened
        with ``urlopen``.
        """
        is_valid = True
        if url.startswith('http://') or url.startswith('https://'):
            if user_agent:
                headers = {"User-Agent": user_agent, 'Accept': '*/*'}
            else:
                headers = None
            r = None
            try:
                r = requests.get(url, headers=headers, stream=True)
                r.raise_for_status()
                # Pull at most one chunk to prove the body is being served.
                # The default avoids StopIteration on an empty body, which
                # the handler below would otherwise report as an error even
                # though the request succeeded.
                next(r.iter_content(1000), None)
            except Exception as e:
                # ``r`` is still None when the request itself failed (DNS,
                # connection refused, ...), so it is checked explicitly.
                if r is not None and r.status_code == 429:
                    # too many requests
                    pass
                # ``elif`` is essential: as two independent ``if``
                # statements a 429 would fall into the ``else`` below and
                # be reported as an error after all.
                elif r is not None and r.status_code == 403 and 'cloudflare' in r.text:
                    # CloudFlare protection block
                    pass
                else:
                    is_valid = False
                    lint_ctx.error("Error '%s' accessing %s" % (e, url))
            finally:
                # With stream=True the connection stays checked out until
                # the response is closed or fully consumed.
                if r is not None:
                    r.close()
        else:
            try:
                with urlopen(url) as handle:
                    handle.read(100)
            except Exception as e:
                is_valid = False
                lint_ctx.error("Error '%s' accessing %s" % (e, url))
        if is_valid:
            lint_ctx.info("URL OK %s" % url)

    for url in urls:
        validate_url(url, lint_ctx)
    # URLs in ``docs`` are fetched while identifying as a desktop browser.
    for url in docs:
        validate_url(url, lint_ctx, BROWSER_USER_AGENT)

Example #5

0

Show file

File: lint.py Project: martenson/planemo

def lint_urls(root, lint_ctx):
    """Find referenced URLs and verify they are valid.

    The two URL collections returned by ``find_urls_for_xml(root)`` are each
    probed with one request per URL.  Outcomes are reported through
    ``lint_ctx`` (``info`` on success, ``error`` on failure); nothing is
    returned.
    """
    urls, docs = find_urls_for_xml(root)

    # This is from Google Chrome 53.0.2785.143, current at time of writing:
    BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"

    def validate_url(url, lint_ctx, user_agent=None):
        """Probe ``url`` and report the result through ``lint_ctx``.

        HTTP(S) URLs go through ``requests`` (with ``user_agent`` as the
        ``User-Agent`` header when given); any other scheme is opened with
        ``urlopen``.
        """
        is_valid = True
        if url.startswith('http://') or url.startswith('https://'):
            if user_agent:
                headers = {"User-Agent": user_agent, 'Accept': '*/*'}
            else:
                headers = None
            r = None
            try:
                r = requests.get(url, headers=headers, stream=True)
                r.raise_for_status()
                # Fetch at most one chunk; the default keeps an empty body
                # from raising StopIteration and being reported as an error
                # for an otherwise successful response.
                next(r.iter_content(1000), None)
            except Exception as e:
                # Compare against None explicitly: a Response is falsy for
                # any 4xx/5xx status, so ``if r and ...`` could never match
                # a 429 and rate-limited URLs were always flagged as errors.
                if r is not None and r.status_code == 429:
                    # too many requests
                    pass
                else:
                    is_valid = False
                    lint_ctx.error("Error '%s' accessing %s" % (e, url))
            finally:
                # A streamed response holds its connection until closed.
                if r is not None:
                    r.close()
        else:
            try:
                handle = urlopen(url)
                try:
                    handle.read(100)
                finally:
                    # Close explicitly so the handle is not leaked.
                    handle.close()
            except Exception as e:
                is_valid = False
                lint_ctx.error("Error '%s' accessing %s" % (e, url))
        if is_valid:
            lint_ctx.info("URL OK %s" % url)

    for url in urls:
        validate_url(url, lint_ctx)
    # URLs in ``docs`` are fetched while identifying as a desktop browser.
    for url in docs:
        validate_url(url, lint_ctx, BROWSER_USER_AGENT)