Example #1
def dash_R_cleanup(fs, ps, pic):
    import gc, copy_reg
    import _strptime, linecache, dircache
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    struct._cache.clear()
    filecmp._cache.clear()
    doctest.master = None

    # Collect cyclic trash.
    gc.collect()
Example #2
def custom_scheme_redirect(url_redirect):
    # urlparse.urlsplit doesn't currently handle custom schemes,
    # which we want our callback URLs to support so mobile apps can register
    # their own callback scheme handlers.
    # See http://bugs.python.org/issue9374
    # and http://stackoverflow.com/questions/1417958/parse-custom-uris-with-urlparse-python

    scheme = urlparse.urlsplit(url_redirect)[0]

    scheme_lists = [urlparse.uses_netloc, urlparse.uses_query, urlparse.uses_fragment, urlparse.uses_params, urlparse.uses_relative]
    scheme_lists_modified = []

    # Modify urlparse's internal scheme lists so it properly handles custom schemes
    if scheme:
        for scheme_list in scheme_lists:
            if scheme not in scheme_list:
                scheme_list.append(scheme)
                scheme_lists_modified.append(scheme_list)

    # Clear cache before re-parsing url_redirect
    urlparse.clear_cache()

    # Grab flask/werkzeug redirect result
    redirect_result = redirect(url_redirect)

    # Restore previous urlparse scheme list
    for scheme_list in scheme_lists_modified:
        scheme_list.remove(scheme)

    return redirect_result
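Not part of the example above, but a minimal sketch of the behaviour it works around. On older Python 2 releases urlsplit() only splits the query and fragment for schemes registered in urlparse.uses_query / uses_fragment, and it caches results per URL, which is why clear_cache() has to follow the registration; the 'myapp' scheme and URL below are made up for illustration.

import urlparse

url = 'myapp://callback/auth?code=123#done'
print urlparse.urlsplit(url).query        # '' on affected 2.x releases: 'myapp' is unknown

# Register the custom scheme, then drop the stale cached parse of the same URL.
urlparse.uses_query.append('myapp')
urlparse.uses_fragment.append('myapp')
urlparse.clear_cache()
print urlparse.urlsplit(url).query        # 'code=123'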
Example #3
def dash_R_cleanup(fs, ps, pic):
    import gc, copy_reg
    import _strptime, linecache, dircache
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    struct._cache.clear()
    filecmp._cache.clear()
    doctest.master = None

    # Collect cyclic trash.
    gc.collect()
Example #4
def custom_scheme_redirect(url_redirect):
    # urlparse.urlsplit doesn't currently handle custom schemes,
    # which we want our callback URLs to support so mobile apps can register
    # their own callback scheme handlers.
    # See http://bugs.python.org/issue9374
    # and http://stackoverflow.com/questions/1417958/parse-custom-uris-with-urlparse-python

    scheme = urlparse.urlsplit(url_redirect)[0]

    scheme_lists = [
        urlparse.uses_netloc, urlparse.uses_query, urlparse.uses_fragment,
        urlparse.uses_params, urlparse.uses_relative
    ]
    scheme_lists_modified = []

    # Modify urlparse's internal scheme lists so it properly handles custom schemes
    if scheme:
        for scheme_list in scheme_lists:
            if scheme not in scheme_list:
                scheme_list.append(scheme)
                scheme_lists_modified.append(scheme_list)

    # Clear cache before re-parsing url_redirect
    urlparse.clear_cache()

    # Grab flask/werkzeug redirect result
    redirect_result = redirect(url_redirect)

    # Restore previous urlparse scheme list
    for scheme_list in scheme_lists_modified:
        scheme_list.remove(scheme)

    return redirect_result
Example #5
    def url_is_acceptable(self, url):
        parsed = urlparse.urlparse(url)

        # Work-around a nasty bug. urlparse() caches parsed results and returns them on future calls,
        # and if the cache isn't cleared here, then a unicode string gets added to the cache, which
        # freaks out cherrypy when it independently calls urlparse() with the same URL later.
        urlparse.clear_cache()

        return parsed[0] in self.allowed_schemes
Example #6
  def url_is_acceptable(self,url):
    parsed = urlparse.urlparse(url)

    # Work-around a nasty bug. urlparse() caches parsed results and returns them on future calls,
    # and if the cache isn't cleared here, then a unicode string gets added to the cache, which
    # freaks out cherrypy when it independently calls urlparse() with the same URL later.
    urlparse.clear_cache()

    return parsed[0] in self.allowed_schemes
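A minimal sketch of the cache-pollution bug both comments above describe, assuming a Python 2 interpreter whose _parse_cache key does not include the argument types (later releases added them, as in the urlsplit() shown in the next example); the URL is made up.

import urlparse

urlparse.clear_cache()
urlparse.urlsplit(u'http://example.com/path')         # caches a unicode SplitResult

parts = urlparse.urlsplit('http://example.com/path')  # str argument, same cache key
print type(parts.path)                                 # <type 'unicode'> once polluted

urlparse.clear_cache()                                 # the workaround used above
print type(urlparse.urlsplit('http://example.com/path').path)   # <type 'str'>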
Example #7
    def urlsplit(url, scheme="", allow_fragments=True):
        """Parse a URL into 5 components:
        <scheme>://<netloc>/<path>?<query>#<fragment>
        Return a 5-tuple: (scheme, netloc, path, query, fragment).
        Note that we don't break the components up in smaller bits
        (e.g. netloc is a single string) and we don't expand % escapes."""
        allow_fragments = bool(allow_fragments)
        key = url, scheme, allow_fragments, type(url), type(scheme)
        cached = _parse_cache.get(key, None)
        if cached:
            return cached
        if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
            clear_cache()
        netloc = query = fragment = ""
        i = url.find(":")
        if i > 0:
            if url[:i] == "http":  # optimize the common case
                scheme = url[:i].lower()
                url = url[i + 1 :]
                if url[:2] == "//":
                    netloc, url = _splitnetloc(url, 2)
                    if ("[" in netloc and "]" not in netloc) or (
                        "]" in netloc and "[" not in netloc
                    ):
                        raise ValueError("Invalid IPv6 URL")
                if allow_fragments and "#" in url:
                    url, fragment = url.split("#", 1)
                if "?" in url:
                    url, query = url.split("?", 1)
                v = SplitResult(scheme, netloc, url, query, fragment)
                _parse_cache[key] = v
                return v
            for c in url[:i]:
                if c not in scheme_chars:
                    break
            else:
                # make sure "url" is not actually a port number (in which case
                # "scheme" is really part of the path)
                rest = url[i + 1 :]
                if not rest or any(c not in "0123456789" for c in rest):
                    # not a port number
                    scheme, url = url[:i].lower(), rest

        if url[:2] == "//":
            netloc, url = _splitnetloc(url, 2)
            if ("[" in netloc and "]" not in netloc) or (
                "]" in netloc and "[" not in netloc
            ):
                raise ValueError("Invalid IPv6 URL")
        if allow_fragments and "#" in url:
            url, fragment = url.split("#", 1)
        if "?" in url:
            url, query = url.split("?", 1)
        v = SplitResult(scheme, netloc, url, query, fragment)
        _parse_cache[key] = v
        return v
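As this implementation shows, clear_cache() simply empties the module-level _parse_cache dict that urlsplit() fills (and which wipes itself once it reaches MAX_CACHE_SIZE). A small sketch that pokes at that private dict purely for illustration:

import urlparse

urlparse.clear_cache()
for i in range(5):
    urlparse.urlsplit('http://example.com/%d' % i)

print len(urlparse._parse_cache)   # 5 -- one cached SplitResult per distinct URL
urlparse.clear_cache()
print len(urlparse._parse_cache)   # 0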
Example #8
    def urlsplit(url, scheme='', allow_fragments=True):
        """Parse a URL into 5 components:
        <scheme>://<netloc>/<path>?<query>#<fragment>
        Return a 5-tuple: (scheme, netloc, path, query, fragment).
        Note that we don't break the components up in smaller bits
        (e.g. netloc is a single string) and we don't expand % escapes."""
        allow_fragments = bool(allow_fragments)
        key = url, scheme, allow_fragments, type(url), type(scheme)
        cached = _parse_cache.get(key, None)
        if cached:
            return cached
        if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
            clear_cache()
        netloc = query = fragment = ''
        i = url.find(':')
        if i > 0:
            if url[:i] == 'http': # optimize the common case
                scheme = url[:i].lower()
                url = url[i+1:]
                if url[:2] == '//':
                    netloc, url = _splitnetloc(url, 2)
                    if (('[' in netloc and ']' not in netloc) or
                            (']' in netloc and '[' not in netloc)):
                        raise ValueError("Invalid IPv6 URL")
                if allow_fragments and '#' in url:
                    url, fragment = url.split('#', 1)
                if '?' in url:
                    url, query = url.split('?', 1)
                v = SplitResult(scheme, netloc, url, query, fragment)
                _parse_cache[key] = v
                return v
            for c in url[:i]:
                if c not in scheme_chars:
                    break
            else:
                # make sure "url" is not actually a port number (in which case
                # "scheme" is really part of the path)
                rest = url[i+1:]
                if not rest or any(c not in '0123456789' for c in rest):
                    # not a port number
                    scheme, url = url[:i].lower(), rest

        if url[:2] == '//':
            netloc, url = _splitnetloc(url, 2)
            if (('[' in netloc and ']' not in netloc) or
                    (']' in netloc and '[' not in netloc)):
                raise ValueError("Invalid IPv6 URL")
        if allow_fragments and '#' in url:
            url, fragment = url.split('#', 1)
        if '?' in url:
            url, query = url.split('?', 1)
        v = SplitResult(scheme, netloc, url, query, fragment)
        _parse_cache[key] = v
        return v
Example #9
def dash_R_cleanup(fs, ps, pic, zdc, abcs):
    import gc, copy_reg
    import _strptime, linecache
    dircache = test_support.import_module('dircache', deprecated=True)
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Clear the warnings registry, so they can be displayed again
    for mod in sys.modules.values():
        if hasattr(mod, '__warningregistry__'):
            del mod.__warningregistry__

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)
    try:
        import zipimport
    except ImportError:
        pass # Run unmodified on platforms without zipimport support
    else:
        zipimport._zip_directory_cache.clear()
        zipimport._zip_directory_cache.update(zdc)

    # clear type cache
    sys._clear_type_cache()

    # Clear ABC registries, restoring previously saved ABC registries.
    for abc, registry in abcs.items():
        abc._abc_registry = registry.copy()
        abc._abc_cache.clear()
        abc._abc_negative_cache.clear()

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    filecmp._cache.clear()
    struct._clearcache()
    doctest.master = None

    # Collect cyclic trash.
    gc.collect()
Example #10
def dash_R_cleanup(fs, ps, pic, abcs):
    import gc, copy_reg
    import _strptime, linecache
    dircache = test_support.import_module('dircache', deprecated=True)
    import urlparse, urllib, urllib2, mimetypes, doctest
    import struct, filecmp
    from distutils.dir_util import _path_created

    # Clear the warnings registry, so they can be displayed again
    for mod in sys.modules.values():
        if hasattr(mod, '__warningregistry__'):
            del mod.__warningregistry__

    # Restore some original values.
    warnings.filters[:] = fs
    copy_reg.dispatch_table.clear()
    copy_reg.dispatch_table.update(ps)
    sys.path_importer_cache.clear()
    sys.path_importer_cache.update(pic)

    # clear type cache
    sys._clear_type_cache()

    # Clear ABC registries, restoring previously saved ABC registries.
    for abc, registry in abcs.items():
        abc._abc_registry = registry.copy()
        abc._abc_cache.clear()
        abc._abc_negative_cache.clear()

    # Clear assorted module caches.
    _path_created.clear()
    re.purge()
    _strptime._regex_cache.clear()
    urlparse.clear_cache()
    urllib.urlcleanup()
    urllib2.install_opener(None)
    dircache.reset()
    linecache.clearcache()
    mimetypes._default_mime_types()
    filecmp._cache.clear()
    struct._clearcache()
    doctest.master = None

    if _llvm:
        code_types = (types.CodeType, types.FunctionType, types.MethodType)
        for obj in gc.get_objects():
            if isinstance(obj, code_types):
                _llvm.clear_feedback(obj)

    # Collect cyclic trash.
    gc.collect()
Example #12
def _safe_urlsplit(s):
    """the urlparse.urlsplit cache breaks if it contains unicode and
    we cannot control that.  So we force type cast that thing back
    to what we think it is.
    """
    rv = urlparse.urlsplit(s)
    # we have to check rv[2] here and not rv[1] as rv[1] will be
    # an empty bytestring in case no domain was given.
    if type(rv[2]) is not type(s):
        assert hasattr(urlparse, 'clear_cache')
        urlparse.clear_cache()
        rv = urlparse.urlsplit(s)
        assert type(rv[2]) is type(s)
    return rv
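A short usage sketch of the guard above, assuming the _safe_urlsplit() defined here is in scope; whether the first call actually poisons the cache depends on the interpreter's cache key, and the URL is made up.

import urlparse

urlparse.urlsplit(u'http://example.com/p')    # may leave a unicode result in the cache

parts = _safe_urlsplit('http://example.com/p')
assert type(parts[2]) is str                   # path comes back matching the input type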
Example #14
 def cleanup():
     import _strptime, urlparse, warnings, dircache
     from distutils.dir_util import _path_created
     _path_created.clear()
     warnings.filters[:] = fs
     gc.collect()
     sre.purge()
     _strptime._regex_cache.clear()
     urlparse.clear_cache()
     copy_reg.dispatch_table.clear()
     copy_reg.dispatch_table.update(ps)
     sys.path_importer_cache.clear()
     sys.path_importer_cache.update(pic)
     dircache.reset()
Example #15
    def trace_memory_clean_caches(self):
        """ Avoid polluting results with some builtin python caches """

        urlparse.clear_cache()
        re.purge()
        linecache.clearcache()
        copy_reg.clear_extension_cache()

        if hasattr(fnmatch, "purge"):
            fnmatch.purge()  # pylint: disable=no-member
        elif hasattr(fnmatch, "_purge"):
            fnmatch._purge()

        if hasattr(encodings, "_cache") and len(encodings._cache) > 0:
            encodings._cache = {}

        context.log.handler.flush()
Example #17
    def test_urlparse(self):
        """
        For a given URL, L{http.urlparse} should behave the same as
        L{urlparse}, except it should always return C{str}, never C{unicode}.
        """
        def urls():
            for scheme in ('http', 'https'):
                for host in ('example.com',):
                    for port in (None, 100):
                        for path in ('', 'path'):
                            if port is not None:
                                host = host + ':' + str(port)
                                yield urlunsplit((scheme, host, path, '', ''))


        def assertSameParsing(url, decode):
            """
            Verify that C{url} is parsed into the same objects by both
            L{http.urlparse} and L{urlparse}.
            """
            urlToStandardImplementation = url
            if decode:
                urlToStandardImplementation = url.decode('ascii')
            standardResult = urlparse(urlToStandardImplementation)
            scheme, netloc, path, params, query, fragment = http.urlparse(url)
            self.assertEqual(
                (scheme, netloc, path, params, query, fragment),
                standardResult)
            self.assertTrue(isinstance(scheme, str))
            self.assertTrue(isinstance(netloc, str))
            self.assertTrue(isinstance(path, str))
            self.assertTrue(isinstance(params, str))
            self.assertTrue(isinstance(query, str))
            self.assertTrue(isinstance(fragment, str))

        # With caching, unicode then str
        clear_cache()
        for url in urls():
            assertSameParsing(url, True)
            assertSameParsing(url, False)

        # With caching, str then unicode
        clear_cache()
        for url in urls():
            assertSameParsing(url, False)
            assertSameParsing(url, True)

        # Without caching
        for url in urls():
            clear_cache()
            assertSameParsing(url, True)
            clear_cache()
            assertSameParsing(url, False)
Example #18
def processURL(hypeIndex, urlToGet, domains, id):

    def isExtensionOkay(u):

        if u.endswith('.htm') or u.endswith('.html'):
            return True
        else:
            return False


    def getIndexableContent(soup):
        contents = []

        allTags = soup.findAll(id='body')
        soup = BeautifulSoup(str(allTags[0]))
        allTags = soup.findAll()

        # Try and find the indexable contents
        for tag in allTags:
            for item in tag.contents:
                # Looking for leaf nodes
                if not hasattr(item, 'contents'):
                    if item.__class__ == NavigableString:
                        content = str(item).strip()
                        if content:
                            contents.append(content)

        contents = " ".join([str(s) for s in contents])
        contents = re.sub(entityRE, "", contents)
        return contents


    def getTitle(soup):
        title = soup.find('title')
        if title:
            return title.string
        else:
            return ''


    def getLinkedPages(soup, u, domains):
        newPaths = []
        anchors = soup.findAll('a')
        for a in anchors:
            try:
                href = a['href']
            except KeyError:
                continue

            scheme, host, port, path = my_parse(href)

            if scheme in ('http', 'https', '') and host in domains:
                if path == '' or path[0] != '/':
                    # relative path
                    pathList = u.pathList()[:-1] 
                    currpath = '/'.join(pathList) 
                    if currpath:
                        currpath = '/' + currpath
                    path = currpath + '/' + path
                    path = n_url.normURLPath(path)

                args = n_url.URL.fromString(path).queryList()
                path = '/'+'/'.join(n_url.URL.fromString(path).pathList())
                query = ''
                for arg in args: 
                    if arg[0] in ['page']:
                       query = '?page=%s'%arg[1]
                path = path.encode('ascii')
                path = urllib.quote(path)+query.encode('ascii')
                newPaths.append(path)
            else:
#                print '** Ignore', href
                pass

        return newPaths


    def getSectionAndSummary(soup):
        if id is None:
            return 'any', ''
        summary = soup.findAll('div', attrs={'id':id})
        text = summary[0].findAll(lambda tag: hasattr(tag,'string') and tag.string is not None)
        #for t in text:
            #if t.name in ['h1','h2','h3','h4','strong']:
                #print '***',t.string
            #else:
                #print '---',t.string
            
                    
        if text:
            summary = ' .'.join( [t.string for t in text] )
            section = 'any'
            summary = re.sub( '\s+', ' ', summary)
            #print 'storing', section, ',',summary
            return section, summary[:300]

        return 'any', ''


    def gotPage(page, factory):

        u = n_url.URL.fromString(factory.url)

        if not page.startswith('<!DOCTYPE'):
            # Don't like the look of this url so I won't try and process it
            return factory.url, []
        soup = BeautifulSoup(page)
        title = getTitle(soup)
        content = getIndexableContent(soup)
        newPaths = getLinkedPages(soup, u, domains)
        section, summary = getSectionAndSummary(soup)

        #print '****'
        #print '>> URL', factory.url
        #print '>> content', content

        args = u.queryList()
        query = ''
        for arg in args: 
            if arg[0] in ['page']:
                query = '?page=%s'%arg[1]
        key = '/' + '/'.join(u.pathList()) + query

        if query == '':
            hypeIndex.addDocument(key, title, section, summary, content)

        return key, newPaths
                


    urlparse.clear_cache()
    factory = getPage(urlToGet)
    d = factory.deferred
    d.addCallback(gotPage, factory)
    return d
Example #19
import urlparse

import jsonrpclib

jsonrpclib.config.version = 1.0
import jsonrpclib.SimpleJSONRPCServer
import web

from decoder import Mark4, Mark5B
import scp
import config

urlparse.uses_relative.append('scp')
urlparse.uses_netloc.append('scp')
urlparse.uses_params.append('scp')
urlparse.clear_cache()


def vex2time(str):
    tupletime = time.strptime(str, "%Yy%jd%Hh%Mm%Ss")
    return time.mktime(tupletime)


def time2vex(secs):
    tupletime = time.gmtime(secs)
    return time.strftime("%Yy%jd%Hh%Mm%Ss", tupletime)


os.environ['TZ'] = 'UTC'
time.tzset()
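For context, a small sketch of why the registration above matters: urljoin() returns the relative reference unchanged for schemes missing from uses_relative, and any parse cached before the lists were extended would be stale, hence the clear_cache() call. The host and file names below are made up.

import urlparse

base = 'scp://server/data/n08c1.vex'

print urlparse.urljoin(base, 'n08c1.m5a')   # 'n08c1.m5a' -- 'scp' is not a relative scheme yet

urlparse.uses_relative.append('scp')
urlparse.uses_netloc.append('scp')
urlparse.clear_cache()                      # drop parses cached before registration

print urlparse.urljoin(base, 'n08c1.m5a')   # 'scp://server/data/n08c1.m5a'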
Example #20

if len(sys.argv) == 1:
    print('Usage: time_urlparse_file <filename>')
    exit(0)

filename = sys.argv[1]
total_url_count = 0

total_urllib = 0
total_f = 0
total_fc = 0
total_fcb = 0

curlparse.clear_cache()
urlparse_fast.clear_cache()
urlparse_urllib.clear_cache()

start_all = time.time()
for url in open(filename, 'r'):
    url_bytes = url.encode('utf-8')
    total_url_count += 1

    start = time.time()
    urlparse_fast.urlparse(url)
    total_f += time.time() - start

    start = time.time()
    curlparse.urlparse(url)
    total_fc += time.time() - start