Esempio n. 1
0
    def getURLString(self, surt=False, public_suffix=False):
        """Assemble the URL's components into a single string.

        If ``self.opaque`` is set it is returned unchanged. Otherwise the
        result is built, in order, from the scheme, optional
        ``user:pass@`` credentials, host, optional port, path, query,
        hash and ``last_delimiter``.

        Args:
            surt: when true, the host is passed through ``hostToSURT`` and
                the authority section (credentials, host, port) is wrapped
                in parentheses.
            public_suffix: when true, ``self.getPublicSuffix()`` is used
                in place of ``self.host``.

        Returns:
            The assembled URL string.
        """
        if self.opaque is not None:
            return self.opaque

        # The Java version appends '://' regardless of scheme and treats dns
        # URLs as opaque; this version supports dns URLs and emits 'dns:'
        # without the double slash.
        separator = ':' if self.scheme == 'dns' else '://'
        parts = [self.scheme + separator]

        if surt:
            parts.append("(")

        if self.authUser:
            parts.append(self.authUser)
            if self.authPass:
                # The userinfo syntax is "user:password"; without the colon
                # the two values run together and cannot be told apart.
                # NOTE(review): assumes authPass holds the bare password
                # (no leading ':') -- confirm against the parser.
                parts.append(':' + self.authPass)
            parts.append('@')

        host_part = self.getPublicSuffix() if public_suffix else self.host
        if surt:
            host_part = hostToSURT(host_part)
        parts.append(host_part)

        if self.port != self.DEFAULT_PORT:
            parts.append(":%d" % self.port)

        if surt:
            parts.append(')')

        if self.path:
            parts.append(self.path)
        elif self.query is not None or self.hash is not None:
            # A query or hash must be preceded by a '/' path.
            parts.append('/')

        if self.query is not None:
            parts.append('?' + self.query)
        if self.hash is not None:
            parts.append('#' + self.hash)

        if self.last_delimiter is not None:
            parts.append(self.last_delimiter)

        return "".join(parts)
Esempio n. 2
0
    def getURLString(self, surt=False, public_suffix=False):
        """Assemble the URL's components into a single string.

        If ``self.opaque`` is set it is returned unchanged. Otherwise the
        result is built, in order, from the scheme, optional
        ``user:pass@`` credentials, host, optional port, path, query,
        hash and ``last_delimiter``.

        Args:
            surt: when true, the host is passed through ``hostToSURT`` and
                the authority section (credentials, host, port) is wrapped
                in parentheses.
            public_suffix: when true, ``self.getPublicSuffix()`` is used
                in place of ``self.host``.

        Returns:
            The assembled URL string.
        """
        if self.opaque is not None:
            return self.opaque

        # The Java version appends '://' regardless of scheme and treats dns
        # URLs as opaque; this version supports dns URLs and emits 'dns:'
        # without the double slash.
        separator = ':' if self.scheme == 'dns' else '://'
        parts = [self.scheme + separator]

        if surt:
            parts.append("(")

        if self.authUser:
            parts.append(self.authUser)
            if self.authPass:
                # The userinfo syntax is "user:password"; without the colon
                # the two values run together and cannot be told apart.
                # NOTE(review): assumes authPass holds the bare password
                # (no leading ':') -- confirm against the parser.
                parts.append(':' + self.authPass)
            parts.append('@')

        host_part = self.getPublicSuffix() if public_suffix else self.host
        if surt:
            host_part = hostToSURT(host_part)
        parts.append(host_part)

        if self.port != self.DEFAULT_PORT:
            parts.append(":%d" % self.port)

        if surt:
            parts.append(')')

        if self.path:
            parts.append(self.path)
        elif self.query is not None or self.hash is not None:
            # A query or hash must be preceded by a '/' path.
            parts.append('/')

        if self.query is not None:
            parts.append('?' + self.query)
        if self.hash is not None:
            parts.append('#' + self.hash)

        if self.last_delimiter is not None:
            parts.append(self.last_delimiter)

        return "".join(parts)
Esempio n. 3
0
File: surt.py Progetto: rajbot/surt
def surt(url):
    """
    Return the SURT key for `url`.

    Falsy input yields '-'. URLs beginning with "filedesc", "warcinfo" or
    "whois://" are returned untouched, and "dns:" URLs become the SURT-form
    host followed by ')'. Anything else is parsed, canonicalized and
    rendered in SURT form; the key is whatever follows the first '('.

    These doctests are from WaybackURLKeyMakerTest.java

    >>> surt(None)
    '-'
    >>> surt('')
    '-'
    >>> surt("filedesc:foo.arc.gz")
    'filedesc:foo.arc.gz'
    >>> surt("filedesc:/foo.arc.gz")
    'filedesc:/foo.arc.gz'
    >>> surt("filedesc://foo.arc.gz")
    'filedesc://foo.arc.gz'
    >>> surt("warcinfo:foo.warc.gz")
    'warcinfo:foo.warc.gz'
    >>> surt("dns:alexa.com")
    'com,alexa)'
    >>> surt("dns:archive.org")
    'org,archive)'

    >>> surt("http://www.archive.org/")
    'org,archive)/'
    >>> surt("http://archive.org/")
    'org,archive)/'
    >>> surt("http://archive.org/goo/")
    'org,archive)/goo'
    >>> surt("http://archive.org/goo/?")
    'org,archive)/goo'
    >>> surt("http://archive.org/goo/?b&a")
    'org,archive)/goo?a&b'
    >>> surt("http://archive.org/goo/?a=2&b&a=1")
    'org,archive)/goo?a=1&a=2&b'

    PHP session id:
    >>> surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221")
    'org,archive)/index.php?action=profile;u=4221'

    WHOIS url:
    >>> surt("whois://whois.isoc.org.il/shaveh.co.il")
    'whois://whois.isoc.org.il/shaveh.co.il'

    Yahoo web bug. See https://github.com/internetarchive/surt/issues/1
    >>> surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2')
    'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2'
    """
    if not url:
        return "-"

    # None of these prefixes can coexist with "dns:", so checking them
    # together ahead of the dns case yields the same result as checking
    # them one at a time.
    if url.startswith(("filedesc", "warcinfo", "whois://")):
        return url

    if url.startswith("dns:"):
        dns_host = url[len("dns:"):]
        return hostToSURT(dns_host) + ')'

    parsed = handyurl.parse(url)
    canonical = canonicalize(parsed)
    full_key = canonical.getURLString(surt=True)

    # Everything up to and including the first '(' (the scheme part) is
    # dropped from the key.
    _, paren, remainder = full_key.partition('(')
    if not paren:
        return url  # something very wrong
    return remainder
Esempio n. 4
0
def surt(url):
    """
    Return the SURT key for `url`.

    Falsy input yields '-'. URLs beginning with "filedesc", "warcinfo" or
    "whois://" are returned untouched, and "dns:" URLs become the SURT-form
    host followed by ')'. Anything else is parsed, canonicalized and
    rendered in SURT form; the key is whatever follows the first '('.

    These doctests are from WaybackURLKeyMakerTest.java

    >>> surt(None)
    '-'
    >>> surt('')
    '-'
    >>> surt("filedesc:foo.arc.gz")
    'filedesc:foo.arc.gz'
    >>> surt("filedesc:/foo.arc.gz")
    'filedesc:/foo.arc.gz'
    >>> surt("filedesc://foo.arc.gz")
    'filedesc://foo.arc.gz'
    >>> surt("warcinfo:foo.warc.gz")
    'warcinfo:foo.warc.gz'
    >>> surt("dns:alexa.com")
    'com,alexa)'
    >>> surt("dns:archive.org")
    'org,archive)'

    >>> surt("http://www.archive.org/")
    'org,archive)/'
    >>> surt("http://archive.org/")
    'org,archive)/'
    >>> surt("http://archive.org/goo/")
    'org,archive)/goo'
    >>> surt("http://archive.org/goo/?")
    'org,archive)/goo'
    >>> surt("http://archive.org/goo/?b&a")
    'org,archive)/goo?a&b'
    >>> surt("http://archive.org/goo/?a=2&b&a=1")
    'org,archive)/goo?a=1&a=2&b'

    PHP session id:
    >>> surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221")
    'org,archive)/index.php?action=profile;u=4221'

    WHOIS url:
    >>> surt("whois://whois.isoc.org.il/shaveh.co.il")
    'whois://whois.isoc.org.il/shaveh.co.il'

    Yahoo web bug. See https://github.com/internetarchive/surt/issues/1
    >>> surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2')
    'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2'
    """
    if not url:
        return "-"

    # None of these prefixes can coexist with "dns:", so checking them
    # together ahead of the dns case yields the same result as checking
    # them one at a time.
    if url.startswith(("filedesc", "warcinfo", "whois://")):
        return url

    if url.startswith("dns:"):
        dns_host = url[len("dns:"):]
        return hostToSURT(dns_host) + ')'

    parsed = handyurl.parse(url)
    canonical = canonicalize(parsed)
    full_key = canonical.getURLString(surt=True)

    # Everything up to and including the first '(' (the scheme part) is
    # dropped from the key.
    _, paren, remainder = full_key.partition('(')
    if not paren:
        return url  # something very wrong
    return remainder