def showMementosForURIRs(urir):
    """
    List the captures indexed for a URI-R, or redirect if there is only one.

    The URI-R is first passed through getCompleteURI(). When
    ipwbConfig.isLocalHosty() reports true for it, only the text after the
    fourth '/' is kept. Matching CDXJ lines are then looked up in the replay
    index.

    Returns a 302 redirect to '/<datetime>/<uri>' when exactly one CDXJ line
    matches; otherwise a Response whose body is an HTML list of the captures
    (an empty body when nothing matched).
    """
    urir = getCompleteURI(urir)
    if ipwbConfig.isLocalHosty(urir):
        urir = urir.split('/', 4)[4]

    # The SURT form is not used below; the call is retained as-is.
    surtedURIR = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)

    indexPath = ipwbConfig.getIPWBReplayIndexPath()
    print('Getting CDXJ Lines with the URI-R {0} from {1}'.format(
        urir, indexPath))
    matches = getCDXJLinesWithURIR(urir, indexPath)

    # A single capture needs no listing: send the client straight to it.
    if len(matches) == 1:
        fields = matches[0].split(' ', 2)
        target = '/{1}/{0}'.format(unsurt(fields[0]), fields[1])
        return redirect(target, code=302)

    if not matches:
        return Response('')

    items = []
    for entry in matches:
        parts = entry.split(' ', 2)
        dt14 = parts[1]
        rfc1123 = ipwbConfig.datetimeToRFC1123(dt14)
        items.append('<li><a href="/{1}/{0}">{0} at {2}</a></li>'.format(
            unsurt(parts[0]), dt14, rfc1123))

    heading = '<p>{0} capture(s) available:</p><ul>'.format(len(matches))
    return Response(heading + ''.join(items) + '</ul>')
def unsurt(self):
    """
    Convert this object's SURT-form fields back with ``unsurt``.

    urlkey is assumed to be in surt format by default. In the case of
    non-surt format, this method is called to desurt any urls: every entry
    of ``self.url_prefix``, the pattern of ``self.regex`` (which is then
    recompiled) and ``self.replace`` are each passed through the
    module-level ``unsurt`` function. ``self.regex`` and ``self.replace``
    are left untouched when falsy.
    """
    # Build a real list: on Python 3 a bare map() is a one-shot iterator, so
    # url_prefix would appear empty after it had been traversed once.
    self.url_prefix = [unsurt(prefix) for prefix in self.url_prefix]

    if self.regex:
        self.regex = re.compile(unsurt(self.regex.pattern))

    if self.replace:
        self.replace = unsurt(self.replace)
def generateTimeMapFromCDXJLines(cdxjLines, original, tmself):
    """
    Build a link-format TimeMap string from CDXJ lines.

    cdxjLines -- iterable of CDXJ lines, each '<surt> <datetime> <json>'
    original  -- SURT form of the original URI; unsurt() is applied to it
    tmself    -- URI of this TimeMap; must contain 'timemap/', and the text
                 before that marker is used as the prefix of memento URIs

    Returns the entries joined by ',\\n' with no trailing separator.
    """
    entries = [
        '<{0}>; rel="original"'.format(unsurt(original)),
        '<{0}>; rel="self"; type="application/link-format"'.format(tmself),
    ]

    # Everything in front of 'timemap/' prefixes each memento URI.
    hostAndPort = tmself[:tmself.index('timemap/')]

    for cdxjLine in cdxjLines:
        surtURI, dt14, _jsonBlock = cdxjLine.split(' ', 2)
        rfc1123 = ipwbConfig.datetimeToRFC1123(dt14)
        entries.append(
            '<{0}{1}/{2}>; rel="memento"; datetime="{3}"'.format(
                hostAndPort, dt14, unsurt(surtURI), rfc1123))

    return ',\n'.join(entries)
def getURIsInCDXJ(cdxjFile=INDEX_FILE):
    """
    Return a JSON array (as a string) with one URI per line of a CDXJ file.

    For every line of cdxjFile, the text before the first space is passed
    through unsurt(); the results are serialized in file order.
    """
    with open(cdxjFile) as cdxj:
        uris = [unsurt(record.split(' ')[0]) for record in cdxj]
    return json.dumps(uris)
def getURIsAndDatetimesInCDXJ(cdxjFilePath=INDEX_FILE):
    """
    Summarize a CDXJ index as JSON keyed by URI.

    Returns 0 when getIndexFileContents() yields nothing. Otherwise returns
    a JSON object string mapping each unsurt()-ed URI to
    {'datetimes': [...], 'mime': <mime_type of the last line seen>}.

    Lines rejected by ipwbConfig.isValidCDXJLine(), lines flagged by
    ipwbConfig.isCDXJMetadataRecord(), and lines whose third field cannot be
    parsed as JSON are skipped.
    """
    contents = getIndexFileContents(cdxjFilePath)
    if not contents:
        return 0

    uris = {}
    for record in contents.strip().split('\n'):
        if (not ipwbConfig.isValidCDXJLine(record) or
                ipwbConfig.isCDXJMetadataRecord(record)):
            continue

        fields = record.split(' ', 2)
        uri = unsurt(fields[0])
        dt14 = fields[1]

        try:
            jsonBlock = json.loads(fields[2])
        except Exception:  # Skip lines w/o JSON block
            continue

        entry = uris.setdefault(uri, {'datetimes': []})
        entry['datetimes'].append(dt14)
        entry['mime'] = jsonBlock['mime_type']

    return json.dumps(uris)
def resolveMemento(urir, datetime):
    """
    Look up the capture of a URI-R closest to a supplied datetime.

    Returns a 404 Response when getCDXJLineClosestTo() finds no line.
    Otherwise returns the tuple (newDatetime, linkHeader, uri), where
    newDatetime and uri come from the chosen CDXJ line and linkHeader from
    getLinkHeaderAbbreviatedTimeMap().
    """
    urir = getCompleteURI(urir)
    if ipwbUtils.isLocalHosty(urir):
        # Keep only what follows the fourth '/'
        urir = urir.split('/', 4)[4]

    # The SURT form is not used below; the call is retained as-is.
    surtedURIR = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)

    indexPath = ipwbUtils.getIPWBReplayIndexPath()
    print('Getting CDXJ Lines with the URI-R {0} from {1}'.format(
        urir, indexPath))

    candidates = getCDXJLinesWithURIR(urir, indexPath)
    closestLine = getCDXJLineClosestTo(datetime, candidates)

    if closestLine is None:
        notFound = (
            '<h1>ERROR 404</h1>'
            'No capture found for {0} at {1}.').format(urir, datetime)
        return Response(notFound, status=404)

    fields = closestLine.split(' ')
    uri = unsurt(fields[0])
    newDatetime = fields[1]

    linkHeader = getLinkHeaderAbbreviatedTimeMap(urir, newDatetime)
    return (newDatetime, linkHeader, uri)
def generateCDXJTimeMapFromCDXJLines(cdxjLines, original, tmself):
    """
    Build a CDXJ-format TimeMap string from CDXJ lines.

    cdxjLines -- sequence of CDXJ lines, each '<surt> <datetime> <json>'
    original  -- SURT form of the original URI; unsurt() is applied to it
    tmself    -- URI of this TimeMap; replaced by the proxied URI-T when
                 app.proxy is set. It must contain 'timemap/'.

    The result starts with '!context', '!id', '!keys' and '!meta' header
    lines, followed by one '<datetime> {json}' line per memento. The first
    and last mementos are marked in their "rel" value. There is no trailing
    newline.
    """
    tmurl = getProxiedURIT(tmself)
    if app.proxy is not None:
        tmself = urlunsplit(tmurl)

    # unsurted URI will never have a scheme, add one
    originalURI = 'http://{0}'.format(unsurt(original))

    tmData = '!context ["http://tools.ietf.org/html/rfc7089"]\n'
    tmData += '!id {{"uri": "{0}"}}\n'.format(tmself)
    tmData += '!keys ["memento_datetime_YYYYMMDDhhmmss"]\n'
    tmData += '!meta {{"original_uri": "{0}"}}\n'.format(originalURI)

    linkTMURI = tmself.replace('/timemap/cdxj/', '/timemap/link/')
    tmData += ('!meta {{"timemap_uri": {{'
               '"link_format": "{0}", '
               '"cdxj_format": "{1}"'
               '}}}}\n').format(linkTMURI, tmself)

    # Everything in front of 'timemap/' prefixes each memento URI.
    hostAndPort = tmself[0:tmself.index('timemap/')]

    for i, line in enumerate(cdxjLines):
        # The per-line JSON block is not needed here.
        (surtURI, datetime, _jsonBlock) = line.split(' ', 2)
        dtRFC1123 = ipwbConfig.datetimeToRFC1123(datetime)

        firstLastStr = ''
        if len(cdxjLines) > 1:
            if i == 0:
                firstLastStr = 'first '
            elif i == len(cdxjLines) - 1:
                firstLastStr = 'last '
        elif len(cdxjLines) == 1:
            firstLastStr = 'first last '

        # "datetime" must be separated from its value by ':' (not '=') for
        # the block to be valid JSON.
        tmData += ('{1} {{'
                   '"uri": "{0}{1}/{2}", '
                   '"rel": "{3}memento", '
                   '"datetime": "{4}"}}\n').format(
                       hostAndPort, datetime, unsurt(surtURI),
                       firstLastStr, dtRFC1123)

    tmData = tmData[0:-1]  # Trim the final LF
    return tmData
def generateNoMementosInterface(path, datetime):
    """
    Produce the response for a request that matched no capture.

    When exactly one CDXJ line exists for the URI-R, returns a 302 redirect
    to it. Otherwise returns a 404 Response whose HTML body lists any
    available captures and links to the Link and CDXJ TimeMaps, and whose
    'Link' header is derived from getLinkHeaderAbbreviatedTimeMap().
    """
    body = [
        '<h1>ERROR 404</h1>',
        'No capture found for {0} at {1}.'.format(path, datetime),
    ]

    sameURIRLines = getCDXJLinesWithURIR(path, None)
    print('CDXJ lines with URI-R at {0}'.format(path))
    print(sameURIRLines)

    # TODO: Use closest instead of conditioning on single entry
    # temporary fix for core functionality in #225
    if len(sameURIRLines) == 1:
        fields = sameURIRLines[0].split(' ', 2)
        target = '/{1}/{0}'.format(unsurt(fields[0]), fields[1])
        return redirect(target, code=302)

    # Stays empty when nothing matched; otherwise ends up holding the URI
    # of the last listed capture, which the TimeMap links below reuse.
    urir = ''
    if sameURIRLines:
        body.append('<p>{0} capture(s) available:</p><ul>'.format(
            len(sameURIRLines)))
        for cdxjLine in sameURIRLines:
            fields = cdxjLine.split(' ', 2)
            urir = unsurt(fields[0])
            body.append('<li><a href="/{1}/{0}">{0} at {1}</a></li>'.format(
                urir, fields[1]))
        body.append('</ul>')

    body.append('<p>TimeMaps: ')
    body.append('<a href="/timemap/link/{0}">Link</a> '.format(urir))
    body.append('<a href="/timemap/cdxj/{0}">CDXJ</a> '.format(urir))

    resp = Response(''.join(body), status=404)

    linkHeader = getLinkHeaderAbbreviatedTimeMap(path, datetime)
    # Flatten to one line, then drop the self-reference:
    # by default, a TM has a self-reference URI-T
    resp.headers['Link'] = linkHeader.replace('\n', ' ').replace(
        'self timemap', 'timemap')
    return resp
def generateLinkTimeMapFromCDXJLines(cdxjLines, original, tmself, tgURI):
    """
    Build a link-format TimeMap string from CDXJ lines.

    cdxjLines -- sequence of CDXJ lines, each '<surt> <datetime> <json>'
    original  -- SURT form of the original URI; unsurt() is applied to it
    tmself    -- URI of this TimeMap; replaced by the proxied URI-T when
                 app.proxy is set
    tgURI     -- URI emitted with rel="timegate"

    Entries are separated by ',\\n' and the result ends with a newline.
    The first and last mementos are marked in their rel value.
    """
    tmurl = getProxiedURIT(tmself)
    if app.proxy is not None:
        tmself = urlunsplit(tmurl)

    # Extract and trim for host:port prepending
    tmurl[2] = ''  # Clear TM path
    hostAndPort = urlunsplit(tmurl) + '/'

    # unsurted URI will never have a scheme, add one
    originalURI = 'http://{0}'.format(unsurt(original))
    cdxjTMURI = tmself.replace('/timemap/link/', '/timemap/cdxj/')

    links = [
        '<{0}>; rel="original"'.format(originalURI),
        '<{0}>; rel="self timemap"; '
        'type="application/link-format"'.format(tmself),
        '<{0}>; rel="timemap"; '
        'type="application/cdxj+ors"'.format(cdxjTMURI),
        '<{0}>; rel="timegate"'.format(tgURI),
    ]

    for position, cdxjLine in enumerate(cdxjLines):
        surtURI, dt14, _jsonBlock = cdxjLine.split(' ', 2)
        rfc1123 = ipwbUtils.digits14ToRFC1123(dt14)

        total = len(cdxjLines)
        relPrefix = ''
        if total == 1:
            relPrefix = 'first last '
        elif total > 1:
            if position == 0:
                relPrefix = 'first '
            elif position == total - 1:
                relPrefix = 'last '

        links.append(
            '<{0}memento/{1}/{2}>; rel="{3}memento"; datetime="{4}"'.format(
                hostAndPort, dt14, unsurt(surtURI), relPrefix, rfc1123))

    return ',\n'.join(links) + '\n'
def getURIsAndDatetimesInCDXJ(cdxjFilePath=INDEX_FILE):
    """
    Summarize a CDXJ index as JSON mapping each URI to its datetimes.

    Returns 0 when getIndexFileContents() yields nothing. Otherwise returns
    a JSON object string mapping each unsurt()-ed URI (first field of a
    line) to the list of its second fields, in file order.

    Lines starting with '!' (metadata), blank lines and lines with fewer
    than two space-separated fields are skipped rather than raising
    IndexError.
    """
    indexFileContents = getIndexFileContents(cdxjFilePath)
    if not indexFileContents:
        return 0

    uris = {}
    for line in indexFileContents.strip().split('\n'):
        # A blank interior line has no first character to inspect.
        if not line.strip():
            continue
        if line.startswith('!'):  # Metadata field
            continue

        cdxjFields = line.split(' ')
        if len(cdxjFields) < 2:
            continue  # No datetime field; nothing to record

        uri = unsurt(cdxjFields[0])
        uris.setdefault(uri, []).append(cdxjFields[1])

    return json.dumps(uris)
def showMemento(urir, datetime):
    """
    Serve the capture of a URI-R closest to the supplied datetime.

    Looks up the CDXJ lines for urir in the replay index and picks one with
    getCDXJLineClosestTo(). Returns a 404 Response when no line is found;
    otherwise delegates to show_uri() with the URI from the chosen line and
    the requested datetime.
    """
    # The SURT form is not used below; the call is kept in case it rejects
    # malformed input by raising.
    surt.surt(urir, path_strip_trailing_slash_unless_empty=False)

    indexPath = ipwbConfig.getIPWBReplayIndexPath()
    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    print('Resolving request for {0} at {1}'.format(urir, datetime))
    print('Found {0} cdxj entrie(s) for '.format(len(cdxjLinesWithURIR)))
    print('MEMENTOS:')
    print(cdxjLinesWithURIR)

    closestLine = getCDXJLineClosestTo(datetime, cdxjLinesWithURIR)

    # Check for "no match" before logging: concatenating None to a string
    # raises TypeError, which used to make this branch unreachable.
    if closestLine is None:
        msg = '<h1>ERROR 404</h1>'
        # The requested URI is 'urir'; no 'path' name exists in this scope.
        msg += 'No capture found for {0} at {1}.'.format(urir, datetime)
        return Response(msg, status=404)

    print('best line: ' + closestLine)

    uri = unsurt(closestLine.split(' ')[0])
    return show_uri(uri, datetime)
def showMemento(urir, datetime):
    """
    Serve the capture of a URI-R at a datetime, redirecting to the closest.

    Returns a 404 Response when no CDXJ line is found. When the closest
    capture's datetime differs from the one requested, returns a 302
    redirect to '/memento/<closest datetime>/<urir>'. Otherwise delegates
    to show_uri().
    """
    urir = getCompleteURI(urir)
    if ipwbConfig.isLocalHosty(urir):
        # Keep only what follows the fourth '/'
        urir = urir.split('/', 4)[4]

    # The SURT form is not used below; the call is retained as-is.
    surtedURIR = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)

    indexPath = ipwbConfig.getIPWBReplayIndexPath()
    print('Getting CDXJ Lines with the URI-R {0} from {1}'.format(
        urir, indexPath))

    candidates = getCDXJLinesWithURIR(urir, indexPath)
    closestLine = getCDXJLineClosestTo(datetime, candidates)

    if closestLine is None:
        notFound = (
            '<h1>ERROR 404</h1>'
            'No capture found for {0} at {1}.').format(urir, datetime)
        return Response(notFound, status=404)

    fields = closestLine.split(' ')
    uri = unsurt(fields[0])
    newDatetime = fields[1]

    if newDatetime == datetime:
        return show_uri(uri, newDatetime)

    # Not an exact hit: point the client at the capture that does exist.
    return redirect(
        '/memento/{0}/{1}'.format(newDatetime, urir), code=302)
def __init__(self, jdic):
    """
    Populate message, url, can and domain from a parsed log record.

    jdic -- mapping with a 'message' entry and an optional 'url' entry.

    When 'url' is absent, the URL is derived from the message: the literal
    prefix 'got timemap request url:count, ' is removed and the text after
    the last ':' is dropped.
    NOTE(review): this presumes the remainder looks like '<url>:<count>';
    confirm against the log producer.

    self.can holds the canonicalize()d URL split on ')/', and self.domain
    is unsurt() applied to its first piece with ')/' re-appended.
    """
    self.message = jdic['message']
    m = self.message.replace('got timemap request url:count, ', '')

    # Cut at the last ':' only. str.replace() on the suffix would also
    # delete earlier occurrences of the same text (e.g. a ':8080' port that
    # equals the count) and, with no ':' at all, every copy of the last
    # character.
    fallbackURL = m.rsplit(':', 1)[0]

    self.url = jdic.get('url', fallbackURL)
    self.can = canonicalize(self.url).split(')/')
    self.domain = unsurt('%s%s' % (self.can[0], ')/'))
def show_uri(path, datetime=None):
    """
    Replay an archived resource by looking it up in the CDXJ index and
    fetching its header and payload through IPFS_API.

    path     -- requested URI-R. An empty path serves the web UI and
                'serviceWorker.js' serves the service worker.
    datetime -- optional datetime string appended to the index search key

    Returns a Response: the replayed payload with the archived headers
    (all but Content-Type prefixed with 'X-Archive-Orig-') plus a
    'Memento-Datetime' header; a 404 page listing other captures when the
    index has no match; or a short HTML error message when the IPFS daemon
    is down, the lookup fails, or the IPFS fetch times out.

    Any other failure while fetching from IPFS terminates the process via
    sys.exit(), as before.
    """
    global IPFS_API

    if len(path) == 0:
        return showWebUI('index.html')

    if path == 'serviceWorker.js':
        return getServiceWorker(path)

    daemonAddress = '{0}:{1}'.format(IPFSAPI_IP, IPFSAPI_PORT)
    if not ipwbConfig.isDaemonAlive(daemonAddress):
        errStr = ('IPFS daemon not running. '
                  'Start it using $ ipfs daemon on the command-line '
                  ' or from the <a href="/">'
                  'IPWB replay homepage</a>.')
        return Response(errStr)

    try:
        surtedURI = surt.surt(
            path, path_strip_trailing_slash_unless_empty=False)
        indexPath = ipwbConfig.getIPWBReplayIndexPath()

        searchString = surtedURI
        if datetime is not None:
            searchString = surtedURI + ' ' + datetime

        cdxjLine = getCDXJLine_binarySearch(searchString, indexPath)
    except Exception:
        # Exception (not a bare except) so that interpreter-exit signals
        # such as KeyboardInterrupt are not turned into a "not found" page.
        print(sys.exc_info()[0])
        respString = ('{0} not found :(' +
                      ' <a href="http://{1}:{2}">Go home</a>').format(
            path, IPWBREPLAY_IP, IPWBREPLAY_PORT)
        return Response(respString)

    if cdxjLine is None:  # Resource not found in archives
        msg = '<h1>ERROR 404</h1>'
        msg += 'No capture found for {0} at {1}.'.format(path, datetime)
        linesWithSameURIR = getCDXJLinesWithURIR(path)

        if linesWithSameURIR:
            msg += '<p>{0} capture(s) available:</p><ul>'.format(
                len(linesWithSameURIR))
            for line in linesWithSameURIR:
                fields = line.split(' ', 2)
                msg += ('<li><a href="/{1}/{0}">{0} at {1}</a></li>'.format(
                    unsurt(fields[0]), fields[1]))
            msg += '</ul>'
        return Response(msg, status=404)

    cdxjParts = cdxjLine.split(" ", 2)
    jObj = json.loads(cdxjParts[2])
    # From here on, use the datetime recorded in the index line.
    datetime = cdxjParts[1]

    # The last two '/'-separated pieces of the locator are fetched below as
    # header (second to last) and payload (last).
    digests = jObj['locator'].split('/')

    try:
        payload = IPFS_API.cat(digests[-1], timeout=1)
        header = IPFS_API.cat(digests[-2])
    except ipfsapi.exceptions.TimeoutError:
        print("{0} not found at {1}".format(cdxjParts[0], digests[-1]))
        respString = ('{0} not found in IPFS :(' +
                      ' <a href="http://{1}:{2}">Go home</a>').format(
            path, IPWBREPLAY_IP, IPWBREPLAY_PORT)
        return Response(respString)
    except Exception:
        print(sys.exc_info()[0])
        print("general error")
        # NOTE(review): exiting the process from a request handler is
        # drastic; kept because callers may depend on it.
        sys.exit()

    if 'encryption_method' in jObj:
        pKey = XOR.new(jObj['encryption_key'])
        payload = pKey.decrypt(base64.b64decode(payload))
        hKey = XOR.new(jObj['encryption_key'])
        header = hKey.decrypt(base64.b64decode(header))

    hLines = header.split('\n')
    hLines.pop(0)  # Discard the first line of the stored header block

    resp = Response(payload)

    for hLine in hLines:
        # Blank or malformed lines carry no 'name: value' pair; skip them
        # instead of failing the tuple unpacking below.
        if ': ' not in hLine:
            continue

        k, v = hLine.split(': ', 1)

        if k.lower() == 'transfer-encoding' and v.lower() == 'chunked':
            resp.set_data(extractResponseFromChunkedData(payload))
        if k.lower() != "content-type":
            k = "X-Archive-Orig-" + k

        resp.headers[k] = v

    resp.headers['Memento-Datetime'] = ipwbConfig.datetimeToRFC1123(datetime)

    return resp