Beispiel #1
0
def showMementosForURIRs(urir):
    """Respond with the captures indexed for a URI-R.

    When exactly one CDXJ line matches, a 302 redirect to
    '/<datetime>/<unsurted URI>' is returned. Otherwise the response body
    is an HTML list linking to every matching capture, or an empty body
    when nothing matches.
    """
    urir = getCompleteURI(urir)
    if ipwbConfig.isLocalHosty(urir):
        # Keep only what follows the fourth '/' of the local URI
        urir = urir.split('/', 4)[4]

    # The SURT form is not used below; the call itself is retained
    surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    cdxjPath = ipwbConfig.getIPWBReplayIndexPath()

    print('Getting CDXJ Lines with the URI-R {0} from {1}'.format(
        urir, cdxjPath))
    matches = getCDXJLinesWithURIR(urir, cdxjPath)

    if len(matches) == 1:
        only = matches[0].split(' ', 2)
        target = '/{1}/{0}'.format(unsurt(only[0]), only[1])
        return redirect(target, code=302)

    if not matches:
        return Response('')

    html = ['<p>{0} capture(s) available:</p><ul>'.format(len(matches))]
    for entry in matches:
        cols = entry.split(' ', 2)
        dt14 = cols[1]
        rfc1123 = ipwbConfig.datetimeToRFC1123(dt14)
        html.append('<li><a href="/{1}/{0}">{0} at {2}</a></li>'.format(
            unsurt(cols[0]), dt14, rfc1123))
    html.append('</ul>')
    return Response(''.join(html))
Beispiel #2
0
    def unsurt(self):
        """
        Convert the SURT-form fields of this object back to plain URLs.

        urlkey is assumed to be in surt format by default.
        In the case of non-surt format, this method is called
        to desurt ``url_prefix``, ``regex`` and ``replace`` in place.
        """
        # Build a real list: on Python 3 map() returns a one-shot iterator
        # that would be empty after the first pass over the prefixes.
        self.url_prefix = [unsurt(prefix) for prefix in self.url_prefix]

        if self.regex:
            # Recompile from the desurted pattern text
            self.regex = re.compile(unsurt(self.regex.pattern))

        if self.replace:
            self.replace = unsurt(self.replace)
Beispiel #3
0
def generateTimeMapFromCDXJLines(cdxjLines, original, tmself):
    """Build a link-format TimeMap string from CDXJ lines.

    cdxjLines -- iterable of 'surt datetime json' strings
    original  -- SURT-form original URI, unsurted for the rel="original" link
    tmself    -- URI of this TimeMap; must contain 'timemap/'

    Returns the TimeMap text with the trailing ',' and LF removed.
    """
    chunks = [
        '<{0}>; rel="original",\n'.format(unsurt(original)),
        '<{0}>; rel="self"; '.format(tmself),
        'type="application/link-format",\n',
    ]

    # Everything ahead of 'timemap/' prefixes each memento URI;
    # str.index raises ValueError when that segment is missing
    prefix = tmself[:tmself.index('timemap/')]

    for cdxjLine in cdxjLines:
        surtKey, dt14, _ = cdxjLine.split(' ', 2)
        rfc1123 = ipwbConfig.datetimeToRFC1123(dt14)
        chunks.append(
            '<{0}{1}/{2}>; rel="memento"; datetime="{3}",\n'.format(
                prefix, dt14, unsurt(surtKey), rfc1123))

    # Trim final , and LF
    return ''.join(chunks)[:-2]
Beispiel #4
0
def getURIsInCDXJ(cdxjFile=INDEX_FILE):
    """Return a JSON array of the unsurted first field of every line
    in the CDXJ file at cdxjFile."""
    with open(cdxjFile) as cdxj:
        # The key is whatever precedes the first space on each line
        keys = [unsurt(row.split(' ')[0]) for row in cdxj]
    return json.dumps(keys)
Beispiel #5
0
def getURIsAndDatetimesInCDXJ(cdxjFilePath=INDEX_FILE):
    """Summarise a CDXJ index as JSON keyed by unsurted URI.

    Each value holds 'datetimes' (every datetime seen for the URI, in file
    order) and 'mime' (the 'mime_type' of the last line seen for it).
    Returns 0 instead of a JSON string when the index has no contents.
    """
    contents = getIndexFileContents(cdxjFilePath)
    if not contents:
        return 0

    byURI = {}
    for row in contents.strip().split('\n'):
        # Ignore anything that is not a valid, non-metadata CDXJ line
        if (not ipwbConfig.isValidCDXJLine(row) or
                ipwbConfig.isCDXJMetadataRecord(row)):
            continue

        parts = row.split(' ', 2)
        uri = unsurt(parts[0])
        dt14 = parts[1]

        try:
            meta = json.loads(parts[2])
        except Exception:  # Skip lines w/o JSON block
            continue

        record = byURI.setdefault(uri, {'datetimes': []})
        record['datetimes'].append(dt14)
        record['mime'] = meta['mime_type']

    return json.dumps(byURI)
Beispiel #6
0
def resolveMemento(urir, datetime):
    """Find the capture of a URI-R closest to the supplied datetime.

    Returns a 404 Response when no capture is found; otherwise a tuple of
    (datetime of the chosen capture, Link header value, unsurted URI).
    """
    urir = getCompleteURI(urir)
    if ipwbUtils.isLocalHosty(urir):
        # Keep only what follows the fourth '/' of the local URI
        urir = urir.split('/', 4)[4]

    # The SURT form is not used below; the call itself is retained
    surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    cdxjPath = ipwbUtils.getIPWBReplayIndexPath()

    print('Getting CDXJ Lines with the URI-R {0} from {1}'.format(
        urir, cdxjPath))
    candidates = getCDXJLinesWithURIR(urir, cdxjPath)
    best = getCDXJLineClosestTo(datetime, candidates)

    if best is None:
        body = '<h1>ERROR 404</h1>'
        body += 'No capture found for {0} at {1}.'.format(urir, datetime)
        return Response(body, status=404)

    # First two space-separated fields: SURT key and capture datetime
    tokens = best.split(' ')
    uri = unsurt(tokens[0])
    newDatetime = tokens[1]

    return (newDatetime,
            getLinkHeaderAbbreviatedTimeMap(urir, newDatetime),
            uri)
Beispiel #7
0
def generateCDXJTimeMapFromCDXJLines(cdxjLines, original, tmself):
    """Build a CDXJ-format TimeMap string from CDXJ index lines.

    cdxjLines -- sequence of 'surt datetime json' strings
    original  -- SURT-form original URI; unsurted and given an http scheme
    tmself    -- URI of this TimeMap; must contain 'timemap/' (str.index
                 raises ValueError otherwise). Replaced by the proxied
                 URI-T when app.proxy is set.

    Returns the '!'-prefixed metadata lines followed by one
    '<datetime> {json}' record per capture, without a trailing LF.
    """
    tmurl = getProxiedURIT(tmself)
    if app.proxy is not None:
        tmself = urlunsplit(tmurl)

    # unsurted URI will never have a scheme, add one
    originalURI = 'http://{0}'.format(unsurt(original))

    tmData = '!context ["http://tools.ietf.org/html/rfc7089"]\n'
    tmData += '!id {{"uri": "{0}"}}\n'.format(tmself)
    tmData += '!keys ["memento_datetime_YYYYMMDDhhmmss"]\n'
    tmData += '!meta {{"original_uri": "{0}"}}\n'.format(originalURI)

    linkTMURI = tmself.replace('/timemap/cdxj/', '/timemap/link/')
    tmData += ('!meta {{"timemap_uri": {{'
               '"link_format": "{0}", '
               '"cdxj_format": "{1}"'
               '}}}}\n').format(linkTMURI, tmself)
    hostAndPort = tmself[0:tmself.index('timemap/')]

    for i, line in enumerate(cdxjLines):
        # Third field (the JSON block) is not needed here
        (surtURI, dt14, _) = line.split(' ', 2)
        dtRFC1123 = ipwbConfig.datetimeToRFC1123(dt14)
        firstLastStr = ''

        if len(cdxjLines) > 1:
            if i == 0:
                firstLastStr = 'first '
            elif i == len(cdxjLines) - 1:
                firstLastStr = 'last '
        elif len(cdxjLines) == 1:
            firstLastStr = 'first last '

        # Use ': ' between the "datetime" key and its value so that each
        # record's block is valid JSON (it was previously '=').
        tmData += ('{1} {{'
                   '"uri": "{0}{1}/{2}", '
                   '"rel": "{3}memento", '
                   '"datetime": "{4}"}}\n').format(hostAndPort, dt14,
                                                  unsurt(surtURI),
                                                  firstLastStr, dtRFC1123)
    tmData = tmData[0:-1]  # Trim final LF
    return tmData
Beispiel #8
0
def generateNoMementosInterface(path, datetime):
    """Build the response for a URI-R with no capture at the datetime.

    Redirects (302) to the capture when exactly one CDXJ line exists for
    the path; otherwise returns a 404 HTML page listing any captures and
    the TimeMap links, with a Link header attached.
    """
    html = ['<h1>ERROR 404</h1>',
            'No capture found for {0} at {1}.'.format(path, datetime)]

    sameURIR = getCDXJLinesWithURIR(path, None)
    print('CDXJ lines with URI-R at {0}'.format(path))
    print(sameURIR)

    # TODO: Use closest instead of conditioning on single entry
    #  temporary fix for core functionality in #225
    if len(sameURIR) == 1:
        only = sameURIR[0].split(' ', 2)
        target = '/{1}/{0}'.format(unsurt(only[0]), only[1])
        return redirect(target, code=302)

    # Stays empty when there are no lines; otherwise ends up holding the
    # URI of the last line, which the TimeMap links below use
    urir = ''
    if sameURIR:
        html.append('<p>{0} capture(s) available:</p><ul>'.format(
            len(sameURIR)))
        for entry in sameURIR:
            cols = entry.split(' ', 2)
            urir = unsurt(cols[0])
            html.append('<li><a href="/{1}/{0}">{0} at {1}</a></li>'.format(
                urir, cols[1]))
        html.append('</ul>')

    html.append('<p>TimeMaps: ')
    html.append('<a href="/timemap/link/{0}">Link</a> '.format(urir))
    html.append('<a href="/timemap/cdxj/{0}">CDXJ</a> '.format(urir))

    resp = Response(''.join(html), status=404)

    link = getLinkHeaderAbbreviatedTimeMap(path, datetime)
    # Flatten to one line, then drop 'self':
    # by default, a TM has a self-reference URI-T
    resp.headers['Link'] = link.replace('\n', ' ').replace(
        'self timemap', 'timemap')

    return resp
Beispiel #9
0
def generateLinkTimeMapFromCDXJLines(cdxjLines, original, tmself, tgURI):
    """Build a link-format TimeMap string from CDXJ index lines.

    cdxjLines -- sequence of 'surt datetime json' strings
    original  -- SURT-form original URI; unsurted and given an http scheme
    tmself    -- URI of this TimeMap (replaced by the proxied URI-T when
                 app.proxy is set)
    tgURI     -- URI used for the rel="timegate" link

    Returns the TimeMap text terminated by a single LF.
    """
    uritParts = getProxiedURIT(tmself)
    if app.proxy is not None:
        tmself = urlunsplit(uritParts)

    # Clear the TimeMap path so the rebuilt URI can prefix memento URIs
    uritParts[2] = ''
    base = urlunsplit(uritParts) + '/'

    # unsurted URI will never have a scheme, add one
    originalURI = 'http://{0}'.format(unsurt(original))
    cdxjTMURI = tmself.replace('/timemap/link/', '/timemap/cdxj/')

    pieces = [
        '<{0}>; rel="original",\n'.format(originalURI),
        '<{0}>; rel="self timemap"; '.format(tmself),
        'type="application/link-format",\n',
        '<{0}>; rel="timemap"; '.format(cdxjTMURI),
        'type="application/cdxj+ors",\n',
        '<{0}>; rel="timegate"'.format(tgURI),
    ]

    for position, cdxjLine in enumerate(cdxjLines):
        surtKey, dt14, _ = cdxjLine.split(' ', 2)
        rfc1123 = ipwbUtils.digits14ToRFC1123(dt14)

        # 'first ' on the opening entry, 'last ' on the closing one;
        # a lone entry is both
        relPrefix = ''
        if position == 0:
            relPrefix += 'first '
        if position == len(cdxjLines) - 1:
            relPrefix += 'last '

        pieces.append(
            ',\n<{0}memento/{1}/{2}>; rel="{3}memento"; datetime="{4}"'
            .format(base, dt14, unsurt(surtKey), relPrefix, rfc1123))

    pieces.append('\n')
    return ''.join(pieces)
Beispiel #10
0
def getURIsAndDatetimesInCDXJ(cdxjFilePath=INDEX_FILE):
    """Return a JSON object mapping each unsurted URI to its datetimes.

    Lines beginning with '!' are treated as metadata and skipped, as are
    blank lines and lines with fewer than two space-separated fields.
    Returns 0 instead of a JSON string when the index has no contents.
    """
    indexFileContents = getIndexFileContents(cdxjFilePath)

    if not indexFileContents:
        return 0

    uris = {}
    for line in indexFileContents.strip().split('\n'):
        # A blank interior line would raise IndexError if indexed directly
        if not line or line.startswith('!'):  # Blank or metadata field
            continue

        cdxjFields = line.split(' ')
        if len(cdxjFields) < 2:  # No datetime field to report
            continue

        uri = unsurt(cdxjFields[0])
        uris.setdefault(uri, []).append(cdxjFields[1])

    return json.dumps(uris)
Beispiel #11
0
def showMemento(urir, datetime):
    """Replay the capture of a URI-R closest to the supplied datetime.

    Returns a 404 Response when no capture is found; otherwise the result
    of show_uri for the unsurted URI of the closest CDXJ line.
    """
    # NOTE(review): the SURT form is not used below; the call is kept in
    # case callers rely on it raising for malformed input - confirm.
    surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbConfig.getIPWBReplayIndexPath()

    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    print('Resolving request for {0} at {1}'.format(urir, datetime))
    print('Found {0} cdxj entrie(s) for '.format(len(cdxjLinesWithURIR)))
    print('MEMENTOS:')
    print(cdxjLinesWithURIR)

    closestLine = getCDXJLineClosestTo(datetime, cdxjLinesWithURIR)
    if closestLine is None:
        msg = '<h1>ERROR 404</h1>'
        # Report the requested URI-R; no 'path' name exists in this scope
        msg += 'No capture found for {0} at {1}.'.format(urir, datetime)
        return Response(msg, status=404)

    # Log only after the None check: concatenating None raises TypeError
    print('best line: ' + closestLine)

    uri = unsurt(closestLine.split(' ')[0])

    return show_uri(uri, datetime)
Beispiel #12
0
def showMemento(urir, datetime):
    """Serve, or redirect to, the capture of a URI-R nearest a datetime.

    Returns a 404 Response when no capture is found, a 302 redirect to
    '/memento/<datetime>/<urir>' when the nearest capture's datetime
    differs from the one requested, and the show_uri result otherwise.
    """
    urir = getCompleteURI(urir)
    if ipwbConfig.isLocalHosty(urir):
        # Keep only what follows the fourth '/' of the local URI
        urir = urir.split('/', 4)[4]

    # The SURT form is not used below; the call itself is retained
    surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    cdxjPath = ipwbConfig.getIPWBReplayIndexPath()

    print('Getting CDXJ Lines with the URI-R {0} from {1}'.format(
        urir, cdxjPath))
    candidates = getCDXJLinesWithURIR(urir, cdxjPath)

    best = getCDXJLineClosestTo(datetime, candidates)
    if best is None:
        body = '<h1>ERROR 404</h1>'
        body += 'No capture found for {0} at {1}.'.format(urir, datetime)
        return Response(body, status=404)

    # First two space-separated fields: SURT key and capture datetime
    tokens = best.split(' ')
    uri = unsurt(tokens[0])
    newDatetime = tokens[1]

    if newDatetime == datetime:
        return show_uri(uri, newDatetime)
    return redirect('/memento/{0}/{1}'.format(newDatetime, urir), code=302)
Beispiel #13
0
 def __init__(self, jdic):
     """Populate message, url, can and domain from a parsed log record.

     jdic must contain 'message'; 'url' is optional and, when absent, is
     derived from the message text.
     """
     self.message = jdic['message']
     m = self.message.replace('got timemap request url:count, ', '')
     # Fallback URL: the message with only its final ':<suffix>' removed
     # (presumably the request count - verify against the log producer).
     # rpartition cuts at the last colon only, where str.replace would
     # strip every occurrence of the suffix; without a colon m is kept.
     head, sep, _ = m.rpartition(':')
     self.url = jdic.get('url', head if sep else m)
     self.can = canonicalize(self.url).split(')/')
     self.domain = unsurt('%s%s' % (self.can[0], ')/'))
Beispiel #14
0
def show_uri(path, datetime=None):
    """Replay the archived capture of a URI by fetching it from IPFS.

    path     -- requested URI; an empty path serves the web UI index and
                'serviceWorker.js' serves the service worker
    datetime -- optional capture datetime appended to the index search key

    Returns a Response: the replayed payload with the archived headers
    (prefixed 'X-Archive-Orig-' except Content-Type) and Memento-Datetime,
    a 404 page listing other captures when the index has no match, or an
    error message when the daemon, the index lookup or the IPFS fetch
    fails.
    """
    if len(path) == 0:
        return showWebUI('index.html')

    if path == 'serviceWorker.js':
        return getServiceWorker(path)

    daemonAddress = '{0}:{1}'.format(IPFSAPI_IP, IPFSAPI_PORT)
    if not ipwbConfig.isDaemonAlive(daemonAddress):
        errStr = ('IPFS daemon not running. '
                  'Start it using $ ipfs daemon on the command-line '
                  ' or from the <a href="/">'
                  'IPWB replay homepage</a>.')
        return Response(errStr)

    try:
        surtedURI = surt.surt(  # Good ol' pep8 line length
            path,
            path_strip_trailing_slash_unless_empty=False)
        indexPath = ipwbConfig.getIPWBReplayIndexPath()

        searchString = surtedURI
        if datetime is not None:
            searchString = surtedURI + ' ' + datetime

        cdxjLine = getCDXJLine_binarySearch(searchString, indexPath)
    except Exception:
        # Exception (not a bare except) so that SystemExit and
        # KeyboardInterrupt are not swallowed here
        print(sys.exc_info()[0])
        respString = ('{0} not found :(' +
                      ' <a href="http://{1}:{2}">Go home</a>').format(
                          path, IPWBREPLAY_IP, IPWBREPLAY_PORT)
        return Response(respString)

    if cdxjLine is None:  # Resource not found in archives
        msg = '<h1>ERROR 404</h1>'
        msg += 'No capture found for {0} at {1}.'.format(path, datetime)
        linesWithSameURIR = getCDXJLinesWithURIR(path)

        if linesWithSameURIR:
            msg += '<p>{0} capture(s) available:</p><ul>'.format(
                len(linesWithSameURIR))
            for line in linesWithSameURIR:
                fields = line.split(' ', 2)
                msg += ('<li><a href="/{1}/{0}">{0} at {1}</a></li>'.format(
                    unsurt(fields[0]), fields[1]))
            msg += '</ul>'
        return Response(msg, status=404)

    cdxjParts = cdxjLine.split(" ", 2)

    jObj = json.loads(cdxjParts[2])
    datetime = cdxjParts[1]

    # The last two '/'-separated parts of the locator are fetched below:
    # the final one as the payload, the one before it as the header
    digests = jObj['locator'].split('/')

    try:
        payload = IPFS_API.cat(digests[-1], timeout=1)
        header = IPFS_API.cat(digests[-2])
    except ipfsapi.exceptions.TimeoutError:
        print("{0} not found at {1}".format(cdxjParts[0], digests[-1]))
        respString = ('{0} not found in IPFS :(' +
                      ' <a href="http://{1}:{2}">Go home</a>').format(
                          path, IPWBREPLAY_IP, IPWBREPLAY_PORT)
        return Response(respString)
    except Exception:
        print(sys.exc_info()[0])
        print("general error")
        # NOTE(review): this raises SystemExit from inside a request
        # handler; kept as-is, but an error Response may be preferable.
        sys.exit()

    if 'encryption_method' in jObj:
        pKey = XOR.new(jObj['encryption_key'])
        payload = pKey.decrypt(base64.b64decode(payload))
        hKey = XOR.new(jObj['encryption_key'])
        header = hKey.decrypt(base64.b64decode(header))

    hLines = header.split('\n')
    hLines.pop(0)  # First line is not a 'key: value' pair

    resp = Response(payload)

    for hLine in hLines:
        # Lines without a 'key: value' separator (e.g. a blank line)
        # cannot be unpacked below, so skip them
        if ': ' not in hLine:
            continue
        k, v = hLine.split(': ', 1)

        if k.lower() == 'transfer-encoding' and v.lower() == 'chunked':
            resp.set_data(extractResponseFromChunkedData(payload))
        if k.lower() != "content-type":
            k = "X-Archive-Orig-" + k

        resp.headers[k] = v

    resp.headers['Memento-Datetime'] = ipwbConfig.datetimeToRFC1123(datetime)

    return resp