Beispiel #1
0
    def urlsplit(url, scheme='', allow_fragments=True):
        """Parse a URL into 5 components:
        <scheme>://<netloc>/<path>?<query>#<fragment>
        Return a 5-tuple: (scheme, netloc, path, query, fragment).
        Note that we don't break the components up in smaller bits
        (e.g. netloc is a single string) and we don't expand % escapes."""
        allow_fragments = bool(allow_fragments)
        key = url, scheme, allow_fragments, type(url), type(scheme)
        hit = _parse_cache.get(key, None)
        if hit:
            return hit
        # Bound the memoization cache so repeated parses can't grow it forever.
        if len(_parse_cache) >= MAX_CACHE_SIZE:
            clear_cache()

        def _take_netloc(rest):
            # Strip a leading '//<netloc>' if present; reject mismatched
            # IPv6 brackets (exactly one of '[' / ']' occurring).
            if rest[:2] != '//':
                return '', rest
            netloc, rest = _splitnetloc(rest, 2)
            if ('[' in netloc) != (']' in netloc):
                raise ValueError("Invalid IPv6 URL")
            return netloc, rest

        def _finish(scheme, netloc, rest):
            # Split off fragment and query, build the result, and memoize it.
            query = fragment = ''
            if allow_fragments and '#' in rest:
                rest, fragment = rest.split('#', 1)
            if '?' in rest:
                rest, query = rest.split('?', 1)
            result = SplitResult(scheme, netloc, rest, query, fragment)
            _parse_cache[key] = result
            return result

        colon = url.find(':')
        if colon > 0:
            prefix = url[:colon]
            if prefix == 'http':  # optimize the common case
                netloc, rest = _take_netloc(url[colon + 1:])
                return _finish(prefix.lower(), netloc, rest)
            if all(c in scheme_chars for c in prefix):
                rest = url[colon + 1:]
                # make sure "url" is not actually a port number (in which case
                # "scheme" is really part of the path)
                if not rest or any(c not in '0123456789' for c in rest):
                    # not a port number
                    scheme, url = prefix.lower(), rest

        netloc, url = _take_netloc(url)
        return _finish(scheme, netloc, url)
Beispiel #2
0
def url_parse(url):
    """Parse *url* with a cold urllib cache.

    Clears urllib's internal parse cache first so every call re-executes the
    full parsing logic (needed when tracing execution for grammar mining).
    The parse result itself is deliberately discarded.
    """
    clear_cache()
    urlparse(url)
Beispiel #3
0



# Extend the base sample set with an FTP URL so the mined grammar also
# covers a non-http scheme.
URLS_X = URLS + ['ftp://freebsd.org/releases/5.8']

if __name__ == "__main__":
    # Mine a grammar by tracing url_parse over the samples, restricting
    # tracing to urllib's parse.py.
    url_grammar = recover_grammar(url_parse, URLS_X, files=['urllib/parse.py'])


if __name__ == "__main__":
    # Render the recovered grammar as a railroad/syntax diagram.
    syntax_diagram(url_grammar)


if __name__ == "__main__":
    # Re-trace the first sample and dump the raw trace events.
    clear_cache()
    with Tracer(URLS_X[0]) as tracer:
        urlparse(tracer.my_input)
    for i, t in enumerate(tracer.trace):
        # Keep only call/line events originating in parse.py that carry data
        # (t[3] truthy). t[2]._t()[1] appears to be a location/name field of
        # the event -- TODO confirm against the Tracer implementation.
        if t[0] in {'call', 'line'} and 'parse.py' in str(t[2]) and t[3]:
            print(i, t[2]._t()[1], t[3:])


# ## Grammar Miner with Reassignment

if __name__ == "__main__":
    print('\n## Grammar Miner with Reassignment')



Beispiel #4
0
# Command-line entry point: expects the name of a file with one URL per line.
if len(sys.argv) == 1:
    print('Usage: time_urlparse_file <filename>')
    # NOTE(review): a usage error conventionally prints to stderr and exits
    # nonzero; exit(0) reports success here -- confirm before changing.
    exit(0)

filename = sys.argv[1]
total_url_count = 0  # number of URLs processed

# Accumulated wall-clock seconds per parser implementation.
total_urllib = 0  # stdlib urllib timing (accumulated beyond this excerpt)
total_f = 0       # urlparse_fast
total_fc = 0      # curlparse
total_fcb = 0     # presumably curlparse on bytes input -- TODO confirm

# Empty each implementation's memoization cache so all timings start cold.
curlparse.clear_cache()
urlparse_fast.clear_cache()
urlparse_urllib.clear_cache()

start_all = time.time()  # overall wall-clock start for the whole run
for url in open(filename, 'r'):
    url_bytes = url.encode('utf-8')
    total_url_count += 1

    start = time.time()
    urlparse_fast.urlparse(url)
    total_f += time.time() - start

    start = time.time()
    curlparse.urlparse(url)
    total_fc += time.time() - start

    start = time.time()