コード例 #1
0
ファイル: memento.py プロジェクト: databill86/MementoEmbed
def seeddata(urim, preferences):
    """Build the JSON response describing the seed data for a memento.

    Args:
        urim: URI-M of the memento to describe.
        preferences: client preferences; not used by this handler.

    Returns:
        A ``(response, 200)`` tuple whose response body is the JSON document
        and whose Content-Type is ``application/json``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    uricache = getURICache()

    httpcache = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=uricache)

    memento = memento_resource_factory(urim, httpcache)

    sr = SeedResource(memento, httpcache)

    output = {}

    output['urim'] = urim
    # The format string ends in a literal "Z" (UTC designator), so the
    # timestamp must be taken in UTC rather than in local time.
    output['generation-time'] = datetime.utcnow().strftime(timestamp_format)
    output['timemap'] = sr.urit
    output['original-url'] = sr.urir
    output['memento-count'] = sr.mementocount()

    # Guard against a missing datetime instead of failing on None.strftime;
    # a missing value is reported as JSON null.
    first_mdt = sr.first_mdt()
    output['first-memento-datetime'] = (
        first_mdt.strftime(timestamp_format) if first_mdt is not None else None)
    output['first-urim'] = sr.first_urim()

    last_mdt = sr.last_mdt()
    output['last-memento-datetime'] = (
        last_mdt.strftime(timestamp_format) if last_mdt is not None else None)
    output['last-urim'] = sr.last_urim()
    output['metadata'] = sr.seed_metadata()

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
コード例 #2
0
def sentencerank(urim, preferences):
    """Build the JSON response holding sentence scores for a memento.

    Args:
        urim: URI-M of the memento whose raw content is scored.
        preferences: mapping with an "algorithm" entry selecting the scorer;
            unrecognized values fall back to readability/lede3.

    Returns:
        A ``(response, 200)`` tuple; the response carries a
        ``Preference-Applied`` header echoing the requested algorithm.
    """
    generated_at = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

    session = getURICache(urim)
    memento = memento_resource_factory(urim, session)

    requested = preferences["algorithm"]

    # Dispatch table: algorithm name -> scoring function.
    scorers = {
        "readability/lede3": get_sentence_scores_by_readability_and_lede3,
        "readability/textrank": get_sentence_scores_by_readability_and_textrank,
        "justext/textrank": get_sentence_scores_by_just_textrank,
    }
    score = scorers.get(
        requested, get_sentence_scores_by_readability_and_lede3)

    output = {
        'urim': urim,
        'generation-time': generated_at,
    }
    output.update(score(memento.raw_content))

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'
    response.headers['Preference-Applied'] = "algorithm={}".format(requested)

    return response, 200
コード例 #3
0
def archivedata(urim, preferences):
    """Build the JSON response describing the archive that holds a memento.

    Args:
        urim: URI-M of the memento.
        preferences: mapping with a "datauri_favicon" entry; when it is
            "yes" (case-insensitive) the favicon is converted to a 16x16
            PNG data URI.

    Returns:
        A ``(response, 200)`` tuple; the response carries a
        ``Preference-Applied`` header echoing the favicon preference.
    """
    session = getURICache(urim)

    # TODO: only here because we need to detect NotAMemento, need a better solution
    memento_resource_factory(urim, session)

    archive = ArchiveResource(urim, session)

    wants_data_uri = preferences['datauri_favicon'].lower() == 'yes'

    favicon = archive.favicon
    if wants_data_uri:
        favicon = convert_imageuri_to_pngdata_uri(favicon, session, 16, 16)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        'archive-uri': archive.home_uri,
        'archive-name': archive.name,
        'archive-favicon': favicon,
        'archive-collection-id': archive.collection_id,
        'archive-collection-name': archive.collection_name,
        'archive-collection-uri': archive.collection_uri,
    }

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'
    response.headers['Preference-Applied'] = "datauri_favicon={}".format(
        preferences['datauri_favicon'])

    return response, 200
コード例 #4
0
ファイル: memento.py プロジェクト: databill86/MementoEmbed
def originaldata(urim, preferences):
    """Build the JSON response describing a memento's original resource.

    Args:
        urim: URI-M of the memento.
        preferences: mapping with a "datauri_favicon" entry; when it is
            "yes" (case-insensitive) the favicon is converted to a 16x16
            PNG data URI.

    Returns:
        A ``(response, 200)`` tuple; the response carries a
        ``Preference-Applied`` header echoing the favicon preference.

    Raises:
        ValueError: re-raised from the favicon conversion unless its message
            is "not enough image data", in which case the favicon is "".
    """
    session = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, session)
    original = OriginalResource(memento, session)

    if preferences['datauri_favicon'].lower() != 'yes':
        favicon = original.favicon
    else:
        try:
            favicon = convert_imageuri_to_pngdata_uri(
                original.favicon, session, 16, 16)
        except ValueError as e:
            module_logger.exception(
                "an error occurred while generating a data URI for an original resource favicon"
            )

            # Only the "not enough image data" failure is tolerated.
            if str(e) != "not enough image data":
                raise e

            favicon = ""

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        'original-uri': original.uri,
        'original-domain': original.domain,
        'original-favicon': favicon,
        'original-linkstatus': original.link_status,
    }

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'
    response.headers['Preference-Applied'] = "datauri_favicon={}".format(
        preferences['datauri_favicon'])

    return response, 200
コード例 #5
0
ファイル: memento.py プロジェクト: databill86/MementoEmbed
def _rank_images_by_score(images):
    """Return the image URIs in *images* ordered by descending score.

    Entries whose data is None or has no 'calculated score' are skipped.
    """
    scorelist = []

    for imageuri in images:
        entry = images[imageuri]
        # An entry without data cannot be ranked; skip it instead of
        # failing on "'calculated score' in None".
        if entry is not None and 'calculated score' in entry:
            scorelist.append((entry["calculated score"], imageuri))

    return [imageuri for _, imageuri in sorted(scorelist, reverse=True)]


def imagedata(urim, preferences):
    """Build the JSON response listing a memento's images and their ranking.

    Images are first gathered from ``memento.im_urim``; if none of them can
    be ranked, the images are gathered again from ``memento.urim``.

    Args:
        urim: URI-M of the memento.
        preferences: client preferences; not used by this handler.

    Returns:
        A ``(response, 200)`` tuple whose response body is the JSON document
        and whose Content-Type is ``application/json``.
    """
    httpcache = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, httpcache)

    output = {}

    output['urim'] = urim
    output['processed urim'] = memento.im_urim
    output['generation-time'] = datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%SZ")
    output['images'] = generate_images_and_scores(memento.im_urim, httpcache)
    output["ranked images"] = _rank_images_by_score(output['images'])

    if not output["ranked images"]:
        # Nothing could be ranked from the processed URI-M; retry with the
        # memento's own URI-M.
        output['images'] = generate_images_and_scores(memento.urim, httpcache)
        output["ranked images"] = _rank_images_by_score(output['images'])

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
コード例 #6
0
def paragraphrank(urim, preferences):
    """Build the JSON response holding section scores for a memento.

    Args:
        urim: URI-M of the memento whose raw content is scored.
        preferences: client preferences; not used by this handler.

    Returns:
        A ``(response, 200)`` tuple whose response body is the JSON document
        and whose Content-Type is ``application/json``.
    """
    session = getURICache(urim)
    memento = memento_resource_factory(urim, session)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    # Merge the scorer's keys into the top level of the document.
    output.update(get_section_scores_by_readability(memento.raw_content))

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
コード例 #7
0
def contentdata(urim, preferences):
    """Build the JSON response with a memento's title, snippet and datetime.

    Args:
        urim: URI-M of the memento.
        preferences: client preferences; not used by this handler.

    Returns:
        A ``(response, 200)`` tuple whose response body is the JSON document
        and whose Content-Type is ``application/json``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    session = getURICache(urim)
    memento = memento_resource_factory(urim, session)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime(timestamp_format),
        'title': extract_title(memento.raw_content),
        'snippet': extract_text_snippet(memento.raw_content),
        'memento-datetime': memento.memento_datetime.strftime(timestamp_format),
    }

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
コード例 #8
0
def _ranked_image_uris(images):
    """Return the image URIs in *images* ordered by descending score.

    Entries whose data is None or has no 'calculated score' are skipped.
    """
    scorelist = []

    for imageuri in images:
        module_logger.debug("looking for calculated score in imageuri {}".format(imageuri))
        entry = images[imageuri]
        if entry is not None and 'calculated score' in entry:
            scorelist.append((entry["calculated score"], imageuri))

    return [imageuri for _, imageuri in sorted(scorelist, reverse=True)]


def imagedata(urim, preferences):
    """Build the JSON response listing a memento's images and their ranking.

    Images are first gathered from ``memento.im_urim``; if none of them can
    be ranked, the images are gathered again from ``memento.urim``. Both
    passes use the same ranking helper, so entries without data are skipped
    in the fallback pass as well as in the first one.

    Args:
        urim: URI-M of the memento.
        preferences: client preferences; not used by this handler.

    Returns:
        A ``(response, 200)`` tuple whose response body is the JSON document
        and whose Content-Type is ``application/json``.
    """
    httpcache = getURICache(urim)

    memento = memento_resource_factory(urim, httpcache)

    output = {}

    output['urim'] = urim
    output['processed urim'] = memento.im_urim
    output['generation-time'] = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    output['images'] = generate_images_and_scores(memento.im_urim, httpcache)

    module_logger.debug("images data structure: {}".format(pprint.pformat(output['images'], indent=4)))

    output["ranked images"] = _ranked_image_uris(output['images'])

    if not output["ranked images"]:
        # Nothing could be ranked from the processed URI-M; retry with the
        # memento's own URI-M.
        output['images'] = generate_images_and_scores(memento.urim, httpcache)
        output["ranked images"] = _ranked_image_uris(output['images'])

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
コード例 #9
0
ファイル: memento.py プロジェクト: databill86/MementoEmbed
def bestimage(urim, preferences):
    """Build the JSON response naming the best image found for a memento.

    The search runs against ``memento.im_urim`` first and is repeated against
    ``memento.urim`` when only the configured default image comes back.

    Args:
        urim: URI-M of the memento.
        preferences: mapping with a "datauri_image" entry; when it is "yes"
            (case-insensitive) a non-``data:`` image URI is converted to a
            PNG data URI.

    Returns:
        A ``(response, 200)`` tuple; the response carries a
        ``Preference-Applied`` header echoing the image preference.
    """
    session = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, session)

    module_logger.debug(
        "trying to find best image with {}".format(memento.im_urim))

    default_image = current_app.config['DEFAULT_IMAGE_URI']
    chosen = get_best_image(memento.im_urim, session, default_image)

    if chosen == default_image:
        module_logger.debug(
            "got back a blank image, trying again with {}".format(memento.urim))
        chosen = get_best_image(memento.urim, session, default_image)

    wants_data_uri = preferences['datauri_image'].lower() == 'yes'

    # Leave URIs that are already data URIs untouched.
    if wants_data_uri and chosen[0:5] != 'data:':
        chosen = convert_imageuri_to_pngdata_uri(chosen, session, 96)

    output = {
        'urim': urim,
        'best-image-uri': chosen,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
    }

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'
    response.headers['Preference-Applied'] = "datauri_image={}".format(
        preferences['datauri_image'])

    return response, 200
コード例 #10
0
ファイル: memento.py プロジェクト: databill86/MementoEmbed
def paragraphrank(urim, preferences):
    """Build the JSON response holding section scores for a memento.

    Args:
        urim: URI-M of the memento whose raw content is scored.
        preferences: client preferences; not used by this handler.

    Returns:
        A ``(response, 200)`` tuple whose response body is the JSON document
        and whose Content-Type is ``application/json``.
    """
    session = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, session)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    # Merge the scorer's keys into the top level of the document.
    output.update(get_section_scores_by_readability(memento.raw_content))

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
コード例 #11
0
def seeddata(urim, preferences):
    """Build the JSON response describing the seed data for a memento.

    When the memento count is None, a 'seeddata-error' entry naming the
    TimeMap is added. Missing first/last memento datetimes are reported as
    JSON null.

    Args:
        urim: URI-M of the memento to describe.
        preferences: client preferences; not used by this handler.

    Returns:
        A ``(response, 200)`` tuple whose response body is the JSON document
        and whose Content-Type is ``application/json``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    httpcache = getURICache(urim)

    memento = memento_resource_factory(urim, httpcache)

    sr = SeedResource(memento, httpcache)

    output = {}

    output['urim'] = urim
    # The format string ends in a literal "Z" (UTC designator), so the
    # timestamp must be taken in UTC rather than in local time.
    output['generation-time'] = datetime.utcnow().strftime(timestamp_format)
    output['timemap'] = sr.urit
    output['original-url'] = sr.urir

    # Query each value once and reuse it for both the check and the output.
    memento_count = sr.mementocount()
    output['memento-count'] = memento_count

    if memento_count is None:
        output['seeddata-error'] = "There was an issue processing the TimeMap discovered at {}".format(sr.urit)

    first_mdt = sr.first_mdt()
    output['first-memento-datetime'] = (
        first_mdt.strftime(timestamp_format) if first_mdt is not None else None)

    output['first-urim'] = sr.first_urim()

    last_mdt = sr.last_mdt()
    output['last-memento-datetime'] = (
        last_mdt.strftime(timestamp_format) if last_mdt is not None else None)

    output['last-urim'] = sr.last_urim()
    output['metadata'] = sr.seed_metadata()

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
コード例 #12
0
ファイル: memento.py プロジェクト: databill86/MementoEmbed
def contentdata(urim, preferences):
    """Build the JSON response with a memento's title, snippet and datetime.

    Args:
        urim: URI-M of the memento.
        preferences: client preferences; not used by this handler.

    Returns:
        A ``(response, 200)`` tuple whose response body is the JSON document
        and whose Content-Type is ``application/json``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    session = ManagedSession(
        timeout=current_app.config['REQUEST_TIMEOUT_FLOAT'],
        user_agent=__useragent__,
        starting_uri=urim,
        uricache=getURICache())

    memento = memento_resource_factory(urim, session)

    output = {
        'urim': urim,
        'generation-time': datetime.utcnow().strftime(timestamp_format),
        'title': extract_title(memento.raw_content),
        'snippet': extract_text_snippet(memento.raw_content),
        'memento-datetime': memento.memento_datetime.strftime(timestamp_format),
    }

    response = make_response(json.dumps(output, indent=4))
    response.headers['Content-Type'] = 'application/json'

    return response, 200
コード例 #13
0
    def test_archiveiscase_datetime_in_uri(self):
        # An archive.is URI-M that embeds a datetime must yield an
        # ArchiveIsMemento whose raw content is index.html from the
        # downloadable zip.

        self.maxDiff = None

        urim = "http://archive.is/20130508132946/http://flexispy.com/"
        zipurim = "http://archive.is/download/pSSpa.zip"
        expected_original_uri = "http://flexispy.com/"
        expected_urig = "http://archive.is/timegate/http://flexispy.com/"
        mdt_header = "Sat, 02 Feb 2008 06:29:13 GMT"

        html_path = "{}/samples/archive.is-1.html".format(testdir)
        zip_path = "{}/samples/archive.is-1.raw.zip".format(testdir)

        with open(html_path, 'rb') as html_file:
            expected_content = html_file.read()

        with open(zip_path, 'rb') as zip_file:
            zip_content = zip_file.read()

            archive = zipfile.ZipFile(zip_file)
            expected_raw_content = archive.read("index.html")

        link_header = """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """.format(expected_original_uri, expected_urig,
                                       urim)

        memento_response = mock_response(
            headers={
                'content-type': 'text/html',
                'memento-datetime': mdt_header,
                'link': link_header
            },
            text=expected_content,
            status=200,
            url=urim,
            links={
                "original": {"url": expected_original_uri},
                "timegate": {"url": expected_urig}
            })

        # The Wayback-style raw (id_) URI is answered with a 404.
        missing_raw_response = mock_response(
            headers={},
            text="",
            status=404,
            url="http://archive.is/20130508132946id_/http://flexispy.com/")

        zip_response = mock_response(
            headers={'content-type': 'text/html'},
            text="",
            content=zip_content,
            status=200,
            url=zipurim)

        responses = {
            urim: memento_response,
            "http://archive.is/20130508132946id_/http://flexispy.com/":
            missing_raw_response,
            zipurim: zip_response
        }

        session = mock_httpcache(responses)
        memento = memento_resource_factory(urim, session)

        expected_mdt = datetime.strptime(mdt_header,
                                         "%a, %d %b %Y %H:%M:%S GMT")

        self.assertEqual(type(memento), ArchiveIsMemento)

        self.assertEqual(memento.memento_datetime, expected_mdt)
        self.assertEqual(memento.timegate, expected_urig)
        self.assertEqual(memento.original_uri, expected_original_uri)
        self.assertEqual(memento.content, expected_content)
        self.assertEqual(memento.raw_content, expected_raw_content)
コード例 #14
0
    def test_waybackcase(self):
        # A Wayback-style URI-M must yield a WaybackMemento whose raw content
        # comes from the corresponding id_ URI.

        urim = "http://myarchive.org/memento/20080202062913/http://example.com/something"
        raw_urim = "http://myarchive.org/memento/20080202062913id_/http://example.com/something"
        expected_urig = "http://myarchive.org/timegate/http://example.com/something"
        expected_original_uri = "http://example.com/something"
        mdt_header = "Sat, 02 Feb 2008 06:29:13 GMT"

        expected_content = """
        <html>
            <head>
                <title>Is this a good title?</title>
            </head>
            <body>
                <!-- ARCHIVE SPECIFIC STUFF -->
                Is this good text?
            </body>
        </html>"""

        expected_raw_content = """
        <html>
            <head>
                <title>Is this a good title?</title>
            </head>
            <body>
                Is this good text?
            </body>
        </html>"""

        link_header = """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """.format(expected_original_uri, expected_urig,
                                       urim)

        memento_response = mock_response(
            headers={
                'content-type': 'text/html',
                'memento-datetime': mdt_header,
                'link': link_header
            },
            text=expected_content,
            status=200,
            url=urim,
            links={
                "original": {"url": expected_original_uri},
                "timegate": {"url": expected_urig}
            })

        raw_response = mock_response(
            headers={'content-type': 'text/html'},
            text=expected_raw_content,
            status=200,
            url=raw_urim)

        responses = {
            urim: memento_response,
            raw_urim: raw_response
        }

        session = mock_httpcache(responses)
        memento = memento_resource_factory(urim, session)

        expected_mdt = datetime.strptime(mdt_header,
                                         "%a, %d %b %Y %H:%M:%S GMT")

        self.assertEqual(type(memento), WaybackMemento)

        self.assertEqual(memento.memento_datetime, expected_mdt)
        self.assertEqual(memento.timegate, expected_urig)
        self.assertEqual(memento.original_uri, expected_original_uri)
        self.assertEqual(memento.content, expected_content)
        self.assertEqual(memento.raw_content, expected_raw_content)
コード例 #15
0
    def test_permacc_hashstyle_uris(self):
        # A perma.cc hash-style URI-M must yield a WaybackMemento whose raw
        # content comes from the perma-archives.org id_ URI.

        self.maxDiff = None

        urim = "http://perma.cc/RZP7-3P4P"
        expected_original_uri = "http://www.environment.gov.au/minister/hunt/2014/mr20141215a.html"
        expected_urim = "https://perma-archives.org/warc/20151028031045/http://www.environment.gov.au/minister/hunt/2014/mr20141215a.html"
        expected_raw_uri = "https://perma-archives.org/warc/20151028031045id_/http://www.environment.gov.au/minister/hunt/2014/mr20141215a.html"
        expected_urig = "https://perma-archives.org/warc/timegate/http://www.environment.gov.au/minister/hunt/2014/mr20141215a.html"
        mdt_header = "Sat, 02 Feb 2008 06:29:13 GMT"

        expected_content = "hi"
        expected_raw_content = "hi there"

        link_template = """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """

        def memento_headers(memento_uri):
            # Fresh header dict per response; only the rel="memento" target varies.
            return {
                'content-type': 'text/html',
                'memento-datetime': mdt_header,
                'link': link_template.format(
                    expected_original_uri, expected_urig, memento_uri)
            }

        def relation_links():
            # Fresh links dict per response.
            return {
                "original": {"url": expected_original_uri},
                "timegate": {"url": expected_urig}
            }

        responses = {
            urim: mock_response(
                headers=memento_headers(urim),
                text=expected_content,
                status=200,
                url=urim,
                links=relation_links()
            ),
            expected_raw_uri: mock_response(
                headers=memento_headers(expected_urim),
                text=expected_raw_content,
                status=200,
                url=expected_raw_uri,
                links=relation_links()
            ),
            # requests follows all redirects, so we present the result at the end of the chain
            expected_urig: mock_response(
                headers=memento_headers(expected_urim),
                text=expected_content,
                status=200,  # after following redirects
                url=expected_urim,
                links=relation_links()
            ),
            expected_urim: mock_response(
                headers=memento_headers(expected_urim),
                text=expected_content,
                status=200,  # after following redirects
                url=expected_urim,
                links=relation_links()
            )
        }

        session = mock_httpcache(responses)
        memento = memento_resource_factory(urim, session)

        expected_mdt = datetime.strptime(mdt_header,
                                         "%a, %d %b %Y %H:%M:%S GMT")

        self.assertEqual(type(memento), WaybackMemento)

        self.assertEqual(memento.memento_datetime, expected_mdt)
        self.assertEqual(memento.timegate, expected_urig)
        self.assertEqual(memento.original_uri, expected_original_uri)
        self.assertEqual(memento.content, expected_content)
        self.assertEqual(memento.raw_content, expected_raw_content)
コード例 #16
0
    def test_simplecase(self):
        # A URI-M with no archive-specific pattern must yield a plain
        # MementoResource whose raw content equals its content.

        urim = "http://myarchive.org/memento/http://example.com/something"
        expected_urig = "http://myarchive.org/timegate/http://example.com/something"
        expected_original_uri = "http://example.com/something"
        mdt_header = "Fri, 22 Jun 2018 21:16:36 GMT"

        expected_content = """
        <html>
            <head>
                <title>Is this a good title?</title>
            </head>
            <body>
                Is this good text?
            </body>
        </html>"""

        link_header = """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """.format(expected_original_uri, expected_urig, urim)

        def memento_response():
            # Build a fresh mock response for each cache entry.
            return mock_response(
                headers={
                    'content-type': 'text/html',
                    'memento-datetime': mdt_header,
                    'link': link_header
                },
                text=expected_content,
                status=200,
                url=urim,
                links={
                    "original": {"url": expected_original_uri},
                    "timegate": {"url": expected_urig}
                }
            )

        # requests follows all redirects, so the TimeGate entry presents the
        # result at the end of the chain (the memento itself).
        responses = {
            uri: memento_response() for uri in (urim, expected_urig)
        }

        session = mock_httpcache(responses)
        memento = memento_resource_factory(urim, session)

        expected_mdt = datetime.strptime(mdt_header,
                                         "%a, %d %b %Y %H:%M:%S GMT")

        self.assertEqual(type(memento), MementoResource)

        self.assertEqual(memento.memento_datetime, expected_mdt)
        self.assertEqual(memento.timegate, expected_urig)
        self.assertEqual(memento.original_uri, expected_original_uri)
        self.assertEqual(memento.content, expected_content)
        self.assertEqual(memento.raw_content, expected_content)
コード例 #17
0
    def test_favicon_from_html(self):
        # The favicon declared in the memento's HTML <link rel="icon"> must be
        # reported by OriginalResource as its archived (memento) URI.

        urim = "http://myarchive.org/memento/http://example.com/something"
        expected_urig = "http://myarchive.org/timegate/http://example.com/something"
        expected_original_uri = "http://example.com/something"
        expected_favicon = "http://myarchive.org/memento/http://example.com/content/favicon.ico"
        original_favicon = "http://example.com/content/favicon.ico"
        favicon_urig = "http://myarchive.org/timegate/http://example.com/favicon.ico"
        content_favicon_urig = "http://myarchive.org/timegate/http://example.com/content/favicon.ico"

        expected_content = """
        <html>
            <head>
                <title>Is this a good title?</title>
                <link rel="icon" href="{}" >
            </head>
            <body>
                Is this good text?
            </body>
        </html>""".format(original_favicon)

        link_header = """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """.format(expected_original_uri, expected_urig,
                                       urim)

        def memento_response():
            # Fresh mock response for the memento page (also served at its TimeGate).
            return mock_response(
                headers={
                    'content-type': 'text/html',
                    'memento-datetime': "Fri, 22 Jun 2018 21:16:36 GMT",
                    'link': link_header
                },
                text=expected_content,
                status=200,
                url=urim,
                links={
                    "original": {"url": expected_original_uri},
                    "timegate": {"url": expected_urig}
                })

        def favicon_response():
            # Fresh mock response for the archived favicon image.
            return mock_response(
                headers={'content-type': 'image/'},
                text="a",
                status=200,
                url=expected_favicon)

        responses = {
            urim: memento_response(),
            expected_urig: memento_response(),
            original_favicon: mock_response(
                headers={'content-type': 'image/'},
                text=expected_content,
                status=200,
                url=expected_favicon),
            expected_original_uri: mock_response(
                headers={'content-type': 'text/html'},
                text="",
                status=200,
                url=expected_original_uri),
            content_favicon_urig: mock_response(
                headers={"location": expected_favicon},
                text="",
                status=200,
                url=expected_favicon),
            favicon_urig: favicon_response(),
            expected_favicon: favicon_response()
        }

        session = mock_httpcache(responses)
        memento = memento_resource_factory(urim, session)

        original = OriginalResource(memento, session)

        self.assertEqual(original.domain, "example.com")
        self.assertEqual(original.uri, "http://example.com/something")
        self.assertEqual(original.link_status, "Live")
        self.assertEqual(original.favicon, expected_favicon)
コード例 #18
0
    def test_archiveiscase(self):
        """Check that an archive.is URI-M is handled as an ArchiveIsMemento.

        The mocked cache serves the memento page at ``urim`` and a zip
        archive at ``zipurim`` whose ``index.html`` holds the unwrapped page.
        The test expects ``content`` to equal the page served at ``urim`` and
        ``raw_content`` to equal the UTF-8 bytes of the zipped ``index.html``.
        """

        urim = "http://archive.is/abcd1234"
        zipurim = "http://archive.is/download/abcd1234.zip"
        expected_original_uri = "http://example.com/something"
        expected_urig = "http://myarchive.org/timegate/http://example.com/something"

        expected_raw_content = """
        <html>
            <head>
                <title>Is this a good title?</title>
            </head>
            <body>
                Is this good text?
            </body>
        </html>"""

        # The archived page embeds the raw page inside archive-specific markup.
        expected_content = """
        <html>
            <head>
                <title>ARCHIVED: Is this a good title?</title>
            </head>
            <body>
                <p>Some Archive-specific stuff here</p>
                <div id="SOLID">{}</div>
            </body>
        </html>""".format(expected_raw_content)

        # Build the zip download in memory. The context manager closes the
        # archive (writing its central directory) before the bytes are read,
        # and also closes it if writestr raises.
        file_like_object = io.BytesIO()
        with zipfile.ZipFile(file_like_object, mode='w') as zf:
            zf.writestr('index.html', expected_raw_content)

        zip_content = file_like_object.getvalue()

        cachedict = {
            urim:
            mock_response(headers={
                'content-type':
                'text/html',
                'memento-datetime':
                "Sat, 02 Feb 2008 06:29:13 GMT",
                'link':
                """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """.format(expected_original_uri, expected_urig,
                                       urim)
            },
                          text=expected_content,
                          status=200,
                          url=urim,
                          links={
                              "original": {
                                  "url": expected_original_uri
                              },
                              "timegate": {
                                  "url": expected_urig
                              }
                          }),
            zipurim:
            mock_response(headers={'content-type': 'text/html'},
                          text="",
                          content=zip_content,
                          status=200,
                          url=zipurim)
        }

        mh = mock_httpcache(cachedict)

        mr = memento_resource_factory(urim, mh)

        expected_mdt = datetime.strptime("Sat, 02 Feb 2008 06:29:13 GMT",
                                         "%a, %d %b %Y %H:%M:%S GMT")

        # Show full diffs when the long HTML strings differ.
        self.maxDiff = None

        self.assertEqual(type(mr), ArchiveIsMemento)

        self.assertEqual(mr.memento_datetime, expected_mdt)
        self.assertEqual(mr.timegate, expected_urig)
        self.assertEqual(mr.original_uri, expected_original_uri)
        self.assertEqual(mr.content, expected_content)
        # str.encode already returns bytes, so no extra bytes() wrapper.
        self.assertEqual(mr.raw_content,
                         expected_raw_content.encode('utf-8'))
コード例 #19
0
    def test_imfcase(self):
        """Exercise the factory on a memento that wraps its page in an iframe.

        The mocked page at the "notraw" URI-M contains an iframe with id
        ``theWebpage`` whose ``src`` is the "raw" URI-M. The test expects an
        IMFMemento whose ``content`` is the wrapper page and whose
        ``raw_content`` is the text served at the iframe target.
        """

        memento_uri = "http://myarchive.org/memento/notraw/http://example.com/something"
        raw_memento_uri = "http://myarchive.org/memento/raw/http://example.com/something"
        timegate_uri = "http://myarchive.org/timegate/http://example.com/something"
        original_uri = "http://example.com/something"

        # Same value is served in the header and parsed for the expectation.
        mdt_header = "Sat, 02 Feb 2008 06:29:13 GMT"
        memento_dt = datetime.strptime(mdt_header, "%a, %d %b %Y %H:%M:%S GMT")

        wrapper_html = """
        <html>
            <head>
                <title>ARCHIVED: Is this a good title?</title>
            </head>
            <body>
                <p>Some Archive-specific stuff here</p>
                <iframe id="theWebpage" src="{}"></iframe>
            </body>
        </html>""".format(raw_memento_uri)

        raw_html = """
        <html>
            <head>
                <title>Is this a good title?</title>
            </head>
            <body>
                Is this good text?
            </body>
        </html>"""

        link_header = """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """.format(original_uri, timegate_uri, memento_uri)

        # Response for the wrapper page, carrying the memento headers.
        wrapper_response = mock_response(
            headers={
                'content-type': 'text/html',
                'memento-datetime': mdt_header,
                'link': link_header
            },
            text=wrapper_html,
            status=200,
            url=memento_uri,
            links={
                "original": {"url": original_uri},
                "timegate": {"url": timegate_uri}
            })

        # Response for the iframe target holding the unwrapped page.
        raw_response = mock_response(
            headers={'content-type': 'text/html'},
            text=raw_html,
            status=200,
            url=raw_memento_uri)

        cache = mock_httpcache({
            memento_uri: wrapper_response,
            raw_memento_uri: raw_response
        })

        resource = memento_resource_factory(memento_uri, cache)

        self.assertEqual(type(resource), IMFMemento)

        self.assertEqual(resource.memento_datetime, memento_dt)
        self.assertEqual(resource.timegate, timegate_uri)
        self.assertEqual(resource.original_uri, original_uri)
        self.assertEqual(resource.content, wrapper_html)
        self.assertEqual(resource.raw_content, raw_html)
コード例 #20
0
    def test_simplecase_rotten_resource(self):
        """Check that a 404 on the original URI yields link_status "Rotten".

        The mocked cache answers the URI-M and the timegate with the same
        archived page and memento headers, and answers the original URI with
        status 404. The test expects the OriginalResource to report domain
        ``example.com``, the original URI, and a link status of "Rotten".
        """

        original_uri = "http://example.com/something"
        timegate_uri = "http://myarchive.org/timegate/http://example.com/something"
        memento_uri = "http://myarchive.org/memento/http://example.com/something"

        page_html = """
        <html>
            <head>
                <title>Is this a good title?</title>
            </head>
            <body>
                Is this good text?
            </body>
        </html>"""

        def archived_page():
            # Build a fresh mock on every call so the two cache entries that
            # use it do not share response, header, or link objects.
            return mock_response(
                headers={
                    'content-type': 'text/html',
                    'memento-datetime': "Fri, 22 Jun 2018 21:16:36 GMT",
                    'link':
                    """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """.format(original_uri, timegate_uri, memento_uri)
                },
                text=page_html,
                status=200,
                url=memento_uri,
                links={
                    "original": {"url": original_uri},
                    "timegate": {"url": timegate_uri}
                })

        responses = {}
        responses[memento_uri] = archived_page()
        responses[timegate_uri] = archived_page()
        # The live web resource is gone.
        responses[original_uri] = mock_response(headers={},
                                                text="",
                                                status=404,
                                                url=original_uri)

        cache = mock_httpcache(responses)

        memento = memento_resource_factory(memento_uri, cache)

        original = OriginalResource(memento, cache)

        self.assertEqual(original.domain, "example.com")
        self.assertEqual(original.uri, "http://example.com/something")
        self.assertEqual(original.link_status, "Rotten")
コード例 #21
0
    def test_meta_redirect(self):

        urim = "https://archive-example.org/web/20180401102030/http://example.com/redirpage"
        redirurim = "https://archive-example.org/web/20180308084654/http://example.com/testpage"

        metaredirecthtml = """<html>
<meta http-equiv="refresh" content="0; URL='{}'"/>
</html>""".format(redirurim)

        expected_content = "<html><body>somecontent</body></html>"
        expected_raw_content = expected_content

        expected_original_uri = "http://example.com/redirpage"
        expected_urig = "https://archive-example.org/web/timegate/http://example.com/redirpage"

        redir_expected_original_uri = "http://example.com/testpage"
        redir_expected_urig = "https://archive-example.org/web/timegate/http://example.com/testpage"

        redirurim_raw = "https://archive-example.org/web/20180308084654id_/http://example.com/testpage"
        expected_raw_content = "<html><body>raw content</body></html>"

        cachedict = {
            urim:
            mock_response(headers={
                'content-type':
                'text/html',
                'memento-datetime':
                "Sat, 02 Feb 2008 06:29:13 GMT",
                'link':
                """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """.format(expected_original_uri, expected_urig,
                                       urim)
            },
                          text=metaredirecthtml,
                          content=metaredirecthtml,
                          status=200,
                          url=urim,
                          links={
                              "original": {
                                  "url": expected_original_uri
                              },
                              "timegate": {
                                  "url": expected_urig
                              }
                          }),
            redirurim:
            mock_response(headers={
                'content-type':
                'text/html',
                'memento-datetime':
                "Sat, 02 Feb 2008 06:29:13 GMT",
                'link':
                """<{}>; rel="original", 
                            <{}>; rel="timegate",
                            <http://myarchive.org/timemap/http://example.com/something>; rel="timemap",
                            <{}>; rel="memento"
                            """.format(redir_expected_original_uri,
                                       redir_expected_urig, urim)
            },
                          text=expected_content,
                          content=expected_content,
                          status=200,
                          url=redirurim,
                          links={
                              "original": {
                                  "url": redir_expected_original_uri
                              },
                              "timegate": {
                                  "url": redir_expected_urig
                              }
                          }),
            redirurim_raw:
            mock_response(headers={
                'content-type': 'text/html',
            },
                          text=expected_raw_content,
                          content=expected_raw_content,
                          status=200,
                          url=redirurim_raw)
        }

        mh = mock_httpcache(cachedict)

        mr = memento_resource_factory(urim, mh)

        expected_mdt = datetime.strptime("Sat, 02 Feb 2008 06:29:13 GMT",
                                         "%a, %d %b %Y %H:%M:%S GMT")

        self.assertEqual(type(mr), WaybackMemento)

        self.assertEqual(mr.memento_datetime, expected_mdt)
        self.assertEqual(mr.timegate, redir_expected_urig)
        self.assertEqual(mr.original_uri, redir_expected_original_uri)
        self.assertEqual(mr.content, expected_content)
        self.assertEqual(mr.raw_content, expected_raw_content)