def seeddata(urim, preferences):
    """Return seed (TimeMap-level) information about the memento at ``urim`` as JSON.

    Args:
        urim: URI-M of the memento to describe.
        preferences: not read by this function.

    Returns:
        A ``(response, 200)`` tuple. The response body is an indented JSON
        object with the keys ``urim``, ``generation-time``, ``timemap``,
        ``original-url``, ``memento-count``, ``first-memento-datetime``,
        ``first-urim``, ``last-memento-datetime``, ``last-urim`` and
        ``metadata``; its ``Content-Type`` is ``application/json``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    def _format_mdt(mdt):
        # Report a missing memento-datetime as JSON null instead of failing
        # on None.strftime().
        return None if mdt is None else mdt.strftime(timestamp_format)

    httpcache = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, httpcache)
    sr = SeedResource(memento, httpcache)

    output = {
        'urim': urim,
        # The format string ends in a literal "Z", so the value must be UTC,
        # not local time.
        'generation-time': datetime.utcnow().strftime(timestamp_format),
        'timemap': sr.urit,
        'original-url': sr.urir,
        'memento-count': sr.mementocount(),
        'first-memento-datetime': _format_mdt(sr.first_mdt()),
        'first-urim': sr.first_urim(),
        'last-memento-datetime': _format_mdt(sr.last_mdt()),
        'last-urim': sr.last_urim(),
        'metadata': sr.seed_metadata(),
    }

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
def sentencerank(urim, preferences):
    """Return sentence scores for the raw content of the memento at ``urim`` as JSON.

    Args:
        urim: URI-M of the memento whose sentences are scored.
        preferences: mapping read for ``'algorithm'``. Recognised values are
            ``"readability/lede3"``, ``"readability/textrank"`` and
            ``"justext/textrank"``; any other value falls back to
            ``"readability/lede3"``.

    Returns:
        A ``(response, 200)`` tuple. The JSON body holds ``urim``,
        ``generation-time`` and whatever keys the scoring function returns.
        The ``Preference-Applied`` header names the algorithm that was
        actually run.
    """
    default_algorithm = "readability/lede3"
    scorers = {
        "readability/lede3": get_sentence_scores_by_readability_and_lede3,
        "readability/textrank": get_sentence_scores_by_readability_and_textrank,
        "justext/textrank": get_sentence_scores_by_just_textrank,
    }

    # Unknown algorithms fall back to the default; remember which one ran so
    # the Preference-Applied header does not claim a preference that was
    # never honoured.
    applied_algorithm = preferences["algorithm"]
    if applied_algorithm not in scorers:
        applied_algorithm = default_algorithm

    httpcache = getURICache(urim)
    memento = memento_resource_factory(urim, httpcache)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    output.update(scorers[applied_algorithm](memento.raw_content))

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'
    response.headers['Preference-Applied'] = \
        "algorithm={}".format(applied_algorithm)

    return response, 200
def archivedata(urim, preferences):
    """Describe the web archive holding the memento at ``urim`` as JSON.

    Args:
        urim: URI-M of the memento whose archive is described.
        preferences: mapping read for ``'datauri_favicon'``. When its
            lower-cased value is ``'yes'`` the archive favicon is passed
            through ``convert_imageuri_to_pngdata_uri``; otherwise the
            favicon value is returned untouched.

    Returns:
        A ``(response, 200)`` tuple with a JSON body and a
        ``Preference-Applied`` header echoing ``datauri_favicon``.
    """
    httpcache = getURICache(urim)

    # TODO: only here because we need to detect NotAMemento, need a better solution
    memento = memento_resource_factory(urim, httpcache)

    archive = ArchiveResource(urim, httpcache)

    wants_data_uri = preferences['datauri_favicon'].lower() == 'yes'
    if wants_data_uri:
        favicon_value = convert_imageuri_to_pngdata_uri(
            archive.favicon, httpcache, 16, 16
        )
    else:
        favicon_value = archive.favicon

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        'archive-uri': archive.home_uri,
        'archive-name': archive.name,
        'archive-favicon': favicon_value,
        'archive-collection-id': archive.collection_id,
        'archive-collection-name': archive.collection_name,
        'archive-collection-uri': archive.collection_uri,
    }

    body = json.dumps(output, indent=4)
    response = make_response(body)
    response.headers['Content-Type'] = 'application/json'
    response.headers['Preference-Applied'] = \
        "datauri_favicon={}".format(preferences['datauri_favicon'])

    return response, 200
def originaldata(urim, preferences):
    """Describe the original resource behind the memento at ``urim`` as JSON.

    Args:
        urim: URI-M of the memento.
        preferences: mapping read for ``'datauri_favicon'``. When its
            lower-cased value is ``'yes'`` the favicon is converted with
            ``convert_imageuri_to_pngdata_uri``.

    Returns:
        A ``(response, 200)`` tuple with a JSON body (``urim``,
        ``generation-time``, ``original-uri``, ``original-domain``,
        ``original-favicon``, ``original-linkstatus``) and a
        ``Preference-Applied`` header echoing ``datauri_favicon``.

    Raises:
        ValueError: re-raised from the favicon conversion unless its message
            is exactly ``"not enough image data"``, in which case the
            favicon is reported as an empty string.
    """
    httpcache = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, httpcache)
    originalresource = OriginalResource(memento, httpcache)

    if preferences['datauri_favicon'].lower() != 'yes':
        original_favicon = originalresource.favicon
    else:
        try:
            original_favicon = convert_imageuri_to_pngdata_uri(
                originalresource.favicon, httpcache, 16, 16)
        except ValueError as e:
            module_logger.exception(
                "an error occurred while generating a data URI for an original resource favicon"
            )
            # Only the "not enough image data" failure is tolerated.
            if str(e) != "not enough image data":
                raise e
            original_favicon = ""

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        'original-uri': originalresource.uri,
        'original-domain': originalresource.domain,
        'original-favicon': original_favicon,
        'original-linkstatus': originalresource.link_status,
    }

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'
    response.headers['Preference-Applied'] = \
        "datauri_favicon={}".format(preferences['datauri_favicon'])

    return response, 200
def _rank_images_by_score(images):
    """Return the image URIs in ``images`` ordered by descending 'calculated score'.

    Entries whose value is ``None`` or that have no 'calculated score' key
    are left out of the ranking.
    """
    scored = []
    for imageuri in images:
        image_info = images[imageuri]
        if image_info is not None and 'calculated score' in image_info:
            scored.append((image_info["calculated score"], imageuri))

    # Tuples sort element-wise, so equal scores are ordered by URI.
    return [imageuri for _, imageuri in sorted(scored, reverse=True)]


def imagedata(urim, preferences):
    """Return the images found for the memento at ``urim``, with a ranking, as JSON.

    Images are first gathered for ``memento.im_urim``. If none of them can be
    ranked, they are gathered again for ``memento.urim``.

    Args:
        urim: URI-M of the memento.
        preferences: not read by this function.

    Returns:
        A ``(response, 200)`` tuple. The JSON body holds ``urim``,
        ``processed urim``, ``generation-time``, ``images`` and
        ``ranked images`` (URIs, best score first).
    """
    httpcache = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, httpcache)

    output = {}
    output['urim'] = urim
    output['processed urim'] = memento.im_urim
    output['generation-time'] = datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%SZ")
    output['images'] = generate_images_and_scores(memento.im_urim, httpcache)
    output["ranked images"] = _rank_images_by_score(output['images'])

    if not output["ranked images"]:
        # Nothing could be ranked for im_urim; retry with the memento's urim.
        output['images'] = generate_images_and_scores(memento.urim, httpcache)
        output["ranked images"] = _rank_images_by_score(output['images'])

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
def paragraphrank(urim, preferences):
    """Return section scores for the raw content of the memento at ``urim`` as JSON.

    Args:
        urim: URI-M of the memento.
        preferences: not read by this function.

    Returns:
        A ``(response, 200)`` tuple. The JSON body holds ``urim``,
        ``generation-time`` and the keys produced by
        ``get_section_scores_by_readability``.
    """
    httpcache = getURICache(urim)
    memento = memento_resource_factory(urim, httpcache)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    output.update(get_section_scores_by_readability(memento.raw_content))

    body = json.dumps(output, indent=4)
    response = make_response(body)
    response.headers['Content-Type'] = 'application/json'

    return response, 200
def contentdata(urim, preferences):
    """Return the title, a text snippet and the memento-datetime for ``urim`` as JSON.

    Args:
        urim: URI-M of the memento.
        preferences: not read by this function.

    Returns:
        A ``(response, 200)`` tuple. The JSON body holds ``urim``,
        ``generation-time``, ``title``, ``snippet`` and ``memento-datetime``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    httpcache = getURICache(urim)
    memento = memento_resource_factory(urim, httpcache)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime(timestamp_format),
        'title': extract_title(memento.raw_content),
        'snippet': extract_text_snippet(memento.raw_content),
        'memento-datetime': memento.memento_datetime.strftime(timestamp_format),
    }

    body = json.dumps(output, indent=4)
    response = make_response(body)
    response.headers['Content-Type'] = 'application/json'

    return response, 200
def _ranked_image_uris(images):
    """Return the image URIs in ``images`` ordered by descending 'calculated score'.

    Entries whose value is ``None`` or that have no 'calculated score' key
    are left out of the ranking.
    """
    scored = []
    for imageuri in images:
        module_logger.debug("looking for calculated score in imageuri {}".format(imageuri))
        image_info = images[imageuri]
        if image_info is not None and 'calculated score' in image_info:
            scored.append((image_info["calculated score"], imageuri))

    # Tuples sort element-wise, so equal scores are ordered by URI.
    return [imageuri for _, imageuri in sorted(scored, reverse=True)]


def imagedata(urim, preferences):
    """Return the images found for the memento at ``urim``, with a ranking, as JSON.

    Images are first gathered for ``memento.im_urim``. If none of them can be
    ranked, they are gathered again for ``memento.urim``. Both passes skip
    images whose data is ``None``.

    Args:
        urim: URI-M of the memento.
        preferences: not read by this function.

    Returns:
        A ``(response, 200)`` tuple. The JSON body holds ``urim``,
        ``processed urim``, ``generation-time``, ``images`` and
        ``ranked images`` (URIs, best score first).
    """
    httpcache = getURICache(urim)
    memento = memento_resource_factory(urim, httpcache)

    output = {}
    output['urim'] = urim
    output['processed urim'] = memento.im_urim
    output['generation-time'] = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    output['images'] = generate_images_and_scores(memento.im_urim, httpcache)

    module_logger.debug("images data structure: {}".format(pprint.pformat(output['images'], indent=4)))

    output["ranked images"] = _ranked_image_uris(output['images'])

    if not output["ranked images"]:
        # Nothing could be ranked for im_urim; retry with the memento's urim.
        output['images'] = generate_images_and_scores(memento.urim, httpcache)
        output["ranked images"] = _ranked_image_uris(output['images'])

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
def bestimage(urim, preferences):
    """Return the best image URI for the memento at ``urim`` as JSON.

    The search runs against ``memento.im_urim`` first. If that yields the
    configured ``DEFAULT_IMAGE_URI``, it is repeated against
    ``memento.urim``.

    Args:
        urim: URI-M of the memento.
        preferences: mapping read for ``'datauri_image'``. When its
            lower-cased value is ``'yes'`` and the result is not already a
            ``data:`` URI, the image is converted with
            ``convert_imageuri_to_pngdata_uri``.

    Returns:
        A ``(response, 200)`` tuple with a JSON body (``urim``,
        ``best-image-uri``, ``generation-time``) and a
        ``Preference-Applied`` header echoing ``datauri_image``.
    """
    httpcache = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, httpcache)

    module_logger.debug(
        "trying to find best image with {}".format(memento.im_urim))

    best_image_uri = get_best_image(
        memento.im_urim, httpcache, current_app.config['DEFAULT_IMAGE_URI'])

    got_default = best_image_uri == current_app.config['DEFAULT_IMAGE_URI']
    if got_default:
        module_logger.debug(
            "got back a blank image, trying again with {}".format(memento.urim))
        best_image_uri = get_best_image(
            memento.urim, httpcache, current_app.config['DEFAULT_IMAGE_URI'])

    # Convert only when asked to, and only when not already a data URI.
    if preferences['datauri_image'].lower() == 'yes' and best_image_uri[0:5] != 'data:':
        best_image_uri = convert_imageuri_to_pngdata_uri(
            best_image_uri, httpcache, 96)

    output = {
        'urim': urim,
        'best-image-uri': best_image_uri,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
    }

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'
    response.headers['Preference-Applied'] = \
        "datauri_image={}".format(preferences['datauri_image'])

    return response, 200
def paragraphrank(urim, preferences):
    """Return section scores for the raw content of the memento at ``urim`` as JSON.

    Args:
        urim: URI-M of the memento.
        preferences: not read by this function.

    Returns:
        A ``(response, 200)`` tuple. The JSON body holds ``urim``,
        ``generation-time`` and the keys produced by
        ``get_section_scores_by_readability``.
    """
    httpcache = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, httpcache)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    output.update(get_section_scores_by_readability(memento.raw_content))

    body = json.dumps(output, indent=4)
    response = make_response(body)
    response.headers['Content-Type'] = 'application/json'

    return response, 200
def seeddata(urim, preferences):
    """Return seed (TimeMap-level) information about the memento at ``urim`` as JSON.

    Args:
        urim: URI-M of the memento to describe.
        preferences: not read by this function.

    Returns:
        A ``(response, 200)`` tuple. The JSON body holds ``urim``,
        ``generation-time``, ``timemap``, ``original-url``,
        ``memento-count``, ``first-memento-datetime``, ``first-urim``,
        ``last-memento-datetime``, ``last-urim`` and ``metadata``. When the
        memento count is ``None`` a ``seeddata-error`` message is added, and
        missing first/last memento-datetimes are reported as JSON null.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    httpcache = getURICache(urim)
    memento = memento_resource_factory(urim, httpcache)
    sr = SeedResource(memento, httpcache)

    output = {}
    output['urim'] = urim
    # The format string ends in a literal "Z", so the value must be UTC,
    # not local time.
    output['generation-time'] = datetime.utcnow().strftime(timestamp_format)
    output['timemap'] = sr.urit
    output['original-url'] = sr.urir

    # Query each value once and reuse it for both the check and the output.
    memento_count = sr.mementocount()
    output['memento-count'] = memento_count
    if memento_count is None:
        output['seeddata-error'] = "There was an issue processing the TimeMap discovered at {}".format(sr.urit)

    first_mdt = sr.first_mdt()
    output['first-memento-datetime'] = (
        first_mdt.strftime(timestamp_format) if first_mdt is not None else None
    )
    output['first-urim'] = sr.first_urim()

    last_mdt = sr.last_mdt()
    output['last-memento-datetime'] = (
        last_mdt.strftime(timestamp_format) if last_mdt is not None else None
    )
    output['last-urim'] = sr.last_urim()

    output['metadata'] = sr.seed_metadata()

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
def contentdata(urim, preferences):
    """Return the title, a text snippet and the memento-datetime for ``urim`` as JSON.

    Args:
        urim: URI-M of the memento.
        preferences: not read by this function.

    Returns:
        A ``(response, 200)`` tuple. The JSON body holds ``urim``,
        ``generation-time``, ``title``, ``snippet`` and ``memento-datetime``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    httpcache = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, httpcache)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime(timestamp_format),
        'title': extract_title(memento.raw_content),
        'snippet': extract_text_snippet(memento.raw_content),
        'memento-datetime': memento.memento_datetime.strftime(timestamp_format),
    }

    body = json.dumps(output, indent=4)
    response = make_response(body)
    response.headers['Content-Type'] = 'application/json'

    return response, 200
def test_archiveiscase_datetime_in_uri(self):
    """An archive.is URI-M that embeds a datetime yields an ArchiveIsMemento.

    The expected content is read from ``samples/archive.is-1.html`` and the
    expected raw content from ``index.html`` inside
    ``samples/archive.is-1.raw.zip`` (both under ``testdir``). The zip bytes
    are served by the mocked download URI, and the ``id_`` raw URI is mocked
    as a 404.
    """
    urim = "http://archive.is/20130508132946/http://flexispy.com/"
    zipurim = "http://archive.is/download/pSSpa.zip"
    expected_original_uri = "http://flexispy.com/"
    expected_urig = "http://archive.is/timegate/http://flexispy.com/"

    with open("{}/samples/archive.is-1.html".format(testdir), 'rb') as f:
        expected_content = f.read()

    with open("{}/samples/archive.is-1.raw.zip".format(testdir), 'rb') as f:
        zip_content = f.read()
        # Close the archive explicitly instead of leaving it open until
        # garbage collection.
        with zipfile.ZipFile(f) as zf:
            expected_raw_content = zf.read("index.html")

    cachedict = {
        urim:
            mock_response(
                headers={
                    'content-type': 'text/html',
                    'memento-datetime': "Sat, 02 Feb 2008 06:29:13 GMT",
                    'link': """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, urim)
                },
                text=expected_content,
                status=200,
                url=urim,
                links={
                    "original": {
                        "url": expected_original_uri
                    },
                    "timegate": {
                        "url": expected_urig
                    }
                }),
        "http://archive.is/20130508132946id_/http://flexispy.com/":
            mock_response(
                headers={},
                text="",
                status=404,
                url="http://archive.is/20130508132946id_/http://flexispy.com/"
            ),
        zipurim:
            mock_response(
                headers={
                    'content-type': 'text/html',
                },
                text="",
                content=zip_content,
                status=200,
                url=zipurim)
    }

    mh = mock_httpcache(cachedict)

    mr = memento_resource_factory(urim, mh)

    expected_mdt = datetime.strptime(
        "Sat, 02 Feb 2008 06:29:13 GMT", "%a, %d %b %Y %H:%M:%S GMT")

    self.maxDiff = None

    self.assertEqual(type(mr), ArchiveIsMemento)

    self.assertEqual(mr.memento_datetime, expected_mdt)
    self.assertEqual(mr.timegate, expected_urig)
    self.assertEqual(mr.original_uri, expected_original_uri)
    self.assertEqual(mr.content, expected_content)
    self.assertEqual(mr.raw_content, expected_raw_content)
def test_waybackcase(self):
    """A Wayback-style URI-M yields a WaybackMemento whose raw content comes from the ``id_`` URI."""
    urim = "http://myarchive.org/memento/20080202062913/http://example.com/something"
    raw_urim = "http://myarchive.org/memento/20080202062913id_/http://example.com/something"
    expected_urig = "http://myarchive.org/timegate/http://example.com/something"
    expected_original_uri = "http://example.com/something"
    mdt_header = "Sat, 02 Feb 2008 06:29:13 GMT"

    expected_content = """ <html> <head> <title>Is this a good title?</title> </head> <body> <!-- ARCHIVE SPECIFIC STUFF --> Is this good text? </body> </html>"""

    expected_raw_content = """ <html> <head> <title>Is this a good title?</title> </head> <body> Is this good text? </body> </html>"""

    # Response served for the (rewritten) memento itself.
    memento_headers = {
        'content-type': 'text/html',
        'memento-datetime': mdt_header,
        'link': """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, urim)
    }
    memento_links = {
        "original": {"url": expected_original_uri},
        "timegate": {"url": expected_urig}
    }
    memento_reply = mock_response(
        headers=memento_headers,
        text=expected_content,
        status=200,
        url=urim,
        links=memento_links)

    # Response served for the raw (id_) variant.
    raw_reply = mock_response(
        headers={'content-type': 'text/html'},
        text=expected_raw_content,
        status=200,
        url=raw_urim)

    cachedict = {urim: memento_reply, raw_urim: raw_reply}

    mh = mock_httpcache(cachedict)
    mr = memento_resource_factory(urim, mh)

    expected_mdt = datetime.strptime(mdt_header, "%a, %d %b %Y %H:%M:%S GMT")

    self.assertEqual(type(mr), WaybackMemento)

    self.assertEqual(mr.memento_datetime, expected_mdt)
    self.assertEqual(mr.timegate, expected_urig)
    self.assertEqual(mr.original_uri, expected_original_uri)
    self.assertEqual(mr.content, expected_content)
    self.assertEqual(mr.raw_content, expected_raw_content)
def test_permacc_hashstyle_uris(self):
    """A perma.cc hash-style URI-M is handled as a WaybackMemento.

    Mocked responses are provided for the hash URI, the raw (``id_``) URI,
    the timegate and the perma-archives.org URI-M.
    """
    urim = "http://perma.cc/RZP7-3P4P"
    expected_original_uri = "http://www.environment.gov.au/minister/hunt/2014/mr20141215a.html"
    expected_urim = "https://perma-archives.org/warc/20151028031045/http://www.environment.gov.au/minister/hunt/2014/mr20141215a.html"
    expected_raw_uri = "https://perma-archives.org/warc/20151028031045id_/http://www.environment.gov.au/minister/hunt/2014/mr20141215a.html"
    expected_urig = "https://perma-archives.org/warc/timegate/http://www.environment.gov.au/minister/hunt/2014/mr20141215a.html"

    expected_content = "hi"
    expected_raw_content = "hi there"
    mdt_header = "Sat, 02 Feb 2008 06:29:13 GMT"

    def memento_response(link_header, text, url):
        # Builds fresh header/link dicts on every call so no two cache
        # entries share mutable state.
        return mock_response(
            headers={
                'content-type': 'text/html',
                'memento-datetime': mdt_header,
                'link': link_header
            },
            text=text,
            status=200,
            url=url,
            links={
                "original": {"url": expected_original_uri},
                "timegate": {"url": expected_urig}
            }
        )

    cachedict = {
        urim:
            memento_response(
                """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, urim),
                expected_content,
                urim
            ),
        expected_raw_uri:
            memento_response(
                """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, expected_urim),
                expected_raw_content,
                expected_raw_uri
            ),
        # requests follows all redirects, so we present the result at the end of the chain
        expected_urig:
            memento_response(
                """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" 
""".format(expected_original_uri, expected_urig, expected_urim),
                expected_content,
                # after following redirects
                expected_urim
            ),
        expected_urim:
            memento_response(
                """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, expected_urim),
                expected_content,
                # after following redirects
                expected_urim
            )
    }

    mh = mock_httpcache(cachedict)
    mr = memento_resource_factory(urim, mh)

    expected_mdt = datetime.strptime(mdt_header, "%a, %d %b %Y %H:%M:%S GMT")

    self.maxDiff = None

    self.assertEqual(type(mr), WaybackMemento)

    self.assertEqual(mr.memento_datetime, expected_mdt)
    self.assertEqual(mr.timegate, expected_urig)
    self.assertEqual(mr.original_uri, expected_original_uri)
    self.assertEqual(mr.content, expected_content)
    self.assertEqual(mr.raw_content, expected_raw_content)
def test_simplecase(self):
    """A plain memento yields a MementoResource whose raw content equals its content."""
    urim = "http://myarchive.org/memento/http://example.com/something"
    expected_urig = "http://myarchive.org/timegate/http://example.com/something"
    expected_original_uri = "http://example.com/something"
    mdt_header = "Fri, 22 Jun 2018 21:16:36 GMT"

    expected_content = """ <html> <head> <title>Is this a good title?</title> </head> <body> Is this good text? </body> </html>"""

    def memento_response():
        # The URI-M and the timegate answer with equivalent responses; build
        # a fresh one for each so they do not share mutable state.
        return mock_response(
            headers={
                'content-type': 'text/html',
                'memento-datetime': mdt_header,
                'link': """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, urim)
            },
            text=expected_content,
            status=200,
            url=urim,
            links={
                "original": {"url": expected_original_uri},
                "timegate": {"url": expected_urig}
            }
        )

    cachedict = {
        urim: memento_response(),
        # requests follows all redirects, so we present the result at the end of the chain
        expected_urig: memento_response()
    }

    mh = mock_httpcache(cachedict)
    mr = memento_resource_factory(urim, mh)

    expected_mdt = datetime.strptime(mdt_header, "%a, %d %b %Y %H:%M:%S GMT")

    self.assertEqual(type(mr), MementoResource)

    self.assertEqual(mr.memento_datetime, expected_mdt)
    self.assertEqual(mr.timegate, expected_urig)
    self.assertEqual(mr.original_uri, expected_original_uri)
    self.assertEqual(mr.content, expected_content)
    self.assertEqual(mr.raw_content, expected_content)
def test_favicon_from_html(self):
    """OriginalResource reports the archived favicon linked from the memento's HTML.

    The memento content carries a ``<link rel="icon">`` pointing at the
    original favicon. The test expects ``favicon`` to be the archived
    favicon URI and the link status to be "Live".
    """
    urim = "http://myarchive.org/memento/http://example.com/something"
    expected_urig = "http://myarchive.org/timegate/http://example.com/something"
    expected_original_uri = "http://example.com/something"
    expected_favicon = "http://myarchive.org/memento/http://example.com/content/favicon.ico"
    original_favicon = "http://example.com/content/favicon.ico"
    favicon_urig = "http://myarchive.org/timegate/http://example.com/favicon.ico"
    mdt_header = "Fri, 22 Jun 2018 21:16:36 GMT"

    expected_content = """ <html> <head> <title>Is this a good title?</title> <link rel="icon" href="{}" > </head> <body> Is this good text? </body> </html>""".format(original_favicon)

    def memento_response():
        # Fresh response (and fresh dicts) for each cache entry.
        return mock_response(
            headers={
                'content-type': 'text/html',
                'memento-datetime': mdt_header,
                'link': """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, urim)
            },
            text=expected_content,
            status=200,
            url=urim,
            links={
                "original": {"url": expected_original_uri},
                "timegate": {"url": expected_urig}
            })

    def favicon_image_response():
        # Minimal image response that resolves to the archived favicon URI.
        return mock_response(
            headers={'content-type': 'image/'},
            text="a",
            status=200,
            url=expected_favicon)

    cachedict = {
        urim: memento_response(),
        expected_urig: memento_response(),
        original_favicon:
            mock_response(
                headers={
                    'content-type': 'image/',
                },
                text=expected_content,
                status=200,
                url=expected_favicon),
        expected_original_uri:
            mock_response(
                headers={
                    'content-type': 'text/html',
                },
                text="",
                status=200,
                url=expected_original_uri),
        "http://myarchive.org/timegate/http://example.com/content/favicon.ico":
            mock_response(
                headers={"location": expected_favicon},
                text="",
                status=200,
                url=expected_favicon),
        favicon_urig: favicon_image_response(),
        expected_favicon: favicon_image_response()
    }

    mh = mock_httpcache(cachedict)

    mr = memento_resource_factory(urim, mh)
    ores = OriginalResource(mr, mh)

    self.assertEqual(ores.domain, "example.com")
    self.assertEqual(ores.uri, "http://example.com/something")
    self.assertEqual(ores.link_status, "Live")
    self.assertEqual(ores.favicon, expected_favicon)
def test_archiveiscase(self):
    """An archive.is hash-style URI-M yields an ArchiveIsMemento.

    The raw content is expected to be the UTF-8 bytes of ``index.html`` from
    the zip served at the mocked download URI.
    """
    urim = "http://archive.is/abcd1234"
    zipurim = "http://archive.is/download/abcd1234.zip"
    expected_original_uri = "http://example.com/something"
    expected_urig = "http://myarchive.org/timegate/http://example.com/something"
    mdt_header = "Sat, 02 Feb 2008 06:29:13 GMT"

    expected_raw_content = """ <html> <head> <title>Is this a good title?</title> </head> <body> Is this good text? </body> </html>"""

    expected_content = """ <html> <head> <title>ARCHIVED: Is this a good title?</title> </head> <body> <p>Some Archive-specific stuff here</p> <div id="SOLID">{}</div> </body> </html>""".format(expected_raw_content)

    # Build the in-memory zip archive that the download URI will serve.
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, mode='w') as zf:
        zf.writestr('index.html', expected_raw_content)
    zip_content = zip_buffer.getvalue()

    memento_reply = mock_response(
        headers={
            'content-type': 'text/html',
            'memento-datetime': mdt_header,
            'link': """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, urim)
        },
        text=expected_content,
        status=200,
        url=urim,
        links={
            "original": {"url": expected_original_uri},
            "timegate": {"url": expected_urig}
        })

    zip_reply = mock_response(
        headers={'content-type': 'text/html'},
        text="",
        content=zip_content,
        status=200,
        url=zipurim)

    cachedict = {urim: memento_reply, zipurim: zip_reply}

    mh = mock_httpcache(cachedict)
    mr = memento_resource_factory(urim, mh)

    expected_mdt = datetime.strptime(mdt_header, "%a, %d %b %Y %H:%M:%S GMT")

    self.maxDiff = None

    self.assertEqual(type(mr), ArchiveIsMemento)

    self.assertEqual(mr.memento_datetime, expected_mdt)
    self.assertEqual(mr.timegate, expected_urig)
    self.assertEqual(mr.original_uri, expected_original_uri)
    self.assertEqual(mr.content, expected_content)
    self.assertEqual(mr.raw_content, expected_raw_content.encode('utf-8'))
def test_imfcase(self):
    """A memento that wraps its page in an ``iframe#theWebpage`` yields an IMFMemento.

    The raw content is expected to be what the iframe's ``src`` URI serves.
    """
    urim = "http://myarchive.org/memento/notraw/http://example.com/something"
    raw_urim = "http://myarchive.org/memento/raw/http://example.com/something"
    expected_urig = "http://myarchive.org/timegate/http://example.com/something"
    expected_original_uri = "http://example.com/something"
    mdt_header = "Sat, 02 Feb 2008 06:29:13 GMT"

    expected_content = """ <html> <head> <title>ARCHIVED: Is this a good title?</title> </head> <body> <p>Some Archive-specific stuff here</p> <iframe id="theWebpage" src="{}"></iframe> </body> </html>""".format(raw_urim)

    expected_raw_content = """ <html> <head> <title>Is this a good title?</title> </head> <body> Is this good text? </body> </html>"""

    # Response for the wrapping page.
    wrapper_reply = mock_response(
        headers={
            'content-type': 'text/html',
            'memento-datetime': mdt_header,
            'link': """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, urim)
        },
        text=expected_content,
        status=200,
        url=urim,
        links={
            "original": {"url": expected_original_uri},
            "timegate": {"url": expected_urig}
        })

    # Response for the iframe target.
    raw_reply = mock_response(
        headers={'content-type': 'text/html'},
        text=expected_raw_content,
        status=200,
        url=raw_urim)

    cachedict = {urim: wrapper_reply, raw_urim: raw_reply}

    mh = mock_httpcache(cachedict)
    mr = memento_resource_factory(urim, mh)

    expected_mdt = datetime.strptime(mdt_header, "%a, %d %b %Y %H:%M:%S GMT")

    self.assertEqual(type(mr), IMFMemento)

    self.assertEqual(mr.memento_datetime, expected_mdt)
    self.assertEqual(mr.timegate, expected_urig)
    self.assertEqual(mr.original_uri, expected_original_uri)
    self.assertEqual(mr.content, expected_content)
    self.assertEqual(mr.raw_content, expected_raw_content)
def test_simplecase_rotten_resource(self):
    """OriginalResource reports "Rotten" when the original URI answers with a 404."""
    urim = "http://myarchive.org/memento/http://example.com/something"
    expected_urig = "http://myarchive.org/timegate/http://example.com/something"
    expected_original_uri = "http://example.com/something"
    mdt_header = "Fri, 22 Jun 2018 21:16:36 GMT"

    expected_content = """ <html> <head> <title>Is this a good title?</title> </head> <body> Is this good text? </body> </html>"""

    def memento_response():
        # Fresh response (and fresh dicts) for each cache entry.
        return mock_response(
            headers={
                'content-type': 'text/html',
                'memento-datetime': mdt_header,
                'link': """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, urim)
            },
            text=expected_content,
            status=200,
            url=urim,
            links={
                "original": {"url": expected_original_uri},
                "timegate": {"url": expected_urig}
            })

    cachedict = {
        urim: memento_response(),
        expected_urig: memento_response(),
        # The live web no longer serves the original resource.
        expected_original_uri:
            mock_response(
                headers={},
                text="",
                status=404,
                url=expected_original_uri)
    }

    mh = mock_httpcache(cachedict)

    mr = memento_resource_factory(urim, mh)
    ores = OriginalResource(mr, mh)

    self.assertEqual(ores.domain, "example.com")
    self.assertEqual(ores.uri, "http://example.com/something")
    self.assertEqual(ores.link_status, "Rotten")
def test_meta_redirect(self):
    """A memento that is only an HTML meta-refresh resolves to the redirect target.

    The first URI-M serves a ``<meta http-equiv="refresh">`` page pointing
    at a second URI-M. The test expects the resulting WaybackMemento to
    expose the target's timegate, original URI and content, and raw content
    taken from the target's ``id_`` URI.
    """
    urim = "https://archive-example.org/web/20180401102030/http://example.com/redirpage"
    redirurim = "https://archive-example.org/web/20180308084654/http://example.com/testpage"

    metaredirecthtml = """<html> <meta http-equiv="refresh" content="0; URL='{}'"/> </html>""".format(redirurim)

    expected_content = "<html><body>somecontent</body></html>"
    # The raw content differs from the rewritten content on purpose, so the
    # final assertion proves it came from the id_ URI.
    expected_raw_content = "<html><body>raw content</body></html>"

    expected_original_uri = "http://example.com/redirpage"
    expected_urig = "https://archive-example.org/web/timegate/http://example.com/redirpage"

    redir_expected_original_uri = "http://example.com/testpage"
    redir_expected_urig = "https://archive-example.org/web/timegate/http://example.com/testpage"

    redirurim_raw = "https://archive-example.org/web/20180308084654id_/http://example.com/testpage"

    cachedict = {
        urim:
            mock_response(
                headers={
                    'content-type': 'text/html',
                    'memento-datetime': "Sat, 02 Feb 2008 06:29:13 GMT",
                    'link': """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(expected_original_uri, expected_urig, urim)
                },
                text=metaredirecthtml,
                content=metaredirecthtml,
                status=200,
                url=urim,
                links={
                    "original": {
                        "url": expected_original_uri
                    },
                    "timegate": {
                        "url": expected_urig
                    }
                }),
        redirurim:
            mock_response(
                headers={
                    'content-type': 'text/html',
                    'memento-datetime': "Sat, 02 Feb 2008 06:29:13 GMT",
                    'link': """<{}>; rel="original", <{}>; rel="timegate", <http://myarchive.org/timemap/http://example.com/something>; rel="timemap", <{}>; rel="memento" """.format(redir_expected_original_uri, redir_expected_urig, urim)
                },
                text=expected_content,
                content=expected_content,
                status=200,
                url=redirurim,
                links={
                    "original": {
                        "url": redir_expected_original_uri
                    },
                    "timegate": {
                        "url": redir_expected_urig
                    }
                }),
        redirurim_raw:
            mock_response(
                headers={
                    'content-type': 'text/html',
                },
                text=expected_raw_content,
                content=expected_raw_content,
                status=200,
                url=redirurim_raw)
    }

    mh = mock_httpcache(cachedict)

    mr = memento_resource_factory(urim, mh)

    expected_mdt = datetime.strptime(
        "Sat, 02 Feb 2008 06:29:13 GMT", "%a, %d %b %Y %H:%M:%S GMT")

    self.assertEqual(type(mr), WaybackMemento)

    self.assertEqual(mr.memento_datetime, expected_mdt)
    self.assertEqual(mr.timegate, redir_expected_urig)
    self.assertEqual(mr.original_uri, redir_expected_original_uri)
    self.assertEqual(mr.content, expected_content)
    self.assertEqual(mr.raw_content, expected_raw_content)