Beispiel #1
0
    def fetch_timemap(self):
        """Download the TimeMap at ``self.urit`` and index its mementos.

        The response text is parsed with ``aiu.convert_LinkTimeMap_to_dict``
        and stored on ``self.timemap``. Every entry found under
        ``timemap["mementos"]["list"]`` is reduced to a ``(datetime, uri)``
        tuple, and the tuples are stored in ascending order on
        ``self.sorted_mementos_list``.

        Raises:
            InvalidTimeMapURI: URLRequired, MissingSchema, InvalidSchema or
                InvalidURL was raised while fetching/parsing.
            TimeMapTimeoutError: Timeout was raised.
            TimeMapSSLError: SSLError was raised.
            TimeMapConnectionFailure: UnrewindableBodyError or
                ConnectionError was raised.

        Each of these carries the triggering exception as
        ``original_exception``.
        """
        try:
            # get URI-T
            response = self.httpcache.get(self.urit)
            self.timemap = aiu.convert_LinkTimeMap_to_dict(response.text)

            dated_urims = [
                (entry['datetime'], entry['uri'])
                for entry in self.timemap["mementos"]["list"]
            ]
            # Tuples compare element by element: datetime first, then URI-M.
            dated_urims.sort()
            self.sorted_mementos_list = dated_urims

        # NOTE(review): handler order is significant if these are the
        # `requests` exception classes (SSLError derives from ConnectionError
        # there) -- keep the more specific handlers first; confirm imports.
        except (URLRequired, MissingSchema, InvalidSchema, InvalidURL) as e:
            raise InvalidTimeMapURI("", original_exception=e)
        except Timeout as e:
            raise TimeMapTimeoutError("", original_exception=e)
        except SSLError as e:
            raise TimeMapSSLError("", original_exception=e)
        except (UnrewindableBodyError, ConnectionError) as e:
            raise TimeMapConnectionFailure("", original_exception=e)
Beispiel #2
0
    def test_example_multiple_urits(self):
        """A Link header advertising several TimeMap URIs converts as expected.

        The header lists link-format, JSON and HTML TimeMaps; the expected
        result only contains the link-format and JSON ones.
        """
        link_values = [
            '<http://ogp.me>; rel="original"',
            '<https://perma.cc/timegate/http://ogp.me>; rel="timegate"',
            '<https://perma.cc/timemap/link/http://ogp.me>; rel="timemap"; type="application/link-format"',
            '<https://perma.cc/timemap/json/http://ogp.me>; rel="timemap"; type="application/json"',
            '<https://perma.cc/timemap/html/http://ogp.me>; rel="timemap"; type="text/html"',
            '<https://perma.cc/7YXW-UFQ3>; rel="memento"; datetime="Sun, 04 Oct 2015 23:18:13 GMT"',
        ]
        lheader = ', '.join(link_values)

        actual = convert_LinkTimeMap_to_dict(lheader, debug=False)

        sole_memento = {
            'datetime': datetime.datetime(2015, 10, 4, 23, 18, 13),
            'uri': 'https://perma.cc/7YXW-UFQ3',
        }
        timemap_uris = {
            'json_format': 'https://perma.cc/timemap/json/http://ogp.me',
            'link_format': 'https://perma.cc/timemap/link/http://ogp.me',
        }
        expected = {
            'mementos': {'list': [sole_memento]},
            'original_uri': 'http://ogp.me',
            'timegate_uri': 'https://perma.cc/timegate/http://ogp.me',
            'timemap_uri': timemap_uris,
        }

        # Note that the HTML timemap is not listed because
        # http://mementoweb.org/guide/timemap-json/ does not specify it
        self.assertEqual(actual, expected)
Beispiel #3
0
    def test_example_multiple_urits_missing_quotes(self):
        """Unquoted ``rel``/``type`` values convert when ``noquotes=True``.

        Same links as the quoted multi-TimeMap case, but only the
        ``datetime`` attribute value is quoted in the header.
        """
        lheader = (
            '<http://ogp.me>; rel=original, '
            '<https://perma.cc/timegate/http://ogp.me>; rel=timegate, '
            '<https://perma.cc/timemap/link/http://ogp.me>; rel=timemap; type=application/link-format, '
            '<https://perma.cc/timemap/json/http://ogp.me>; rel=timemap; type=application/json, '
            '<https://perma.cc/timemap/html/http://ogp.me>; rel=timemap; type=text/html, '
            '<https://perma.cc/7YXW-UFQ3>; rel=memento; datetime="Sun, 04 Oct 2015 23:18:13 GMT"'
        )

        conversion_options = {'debug': False, 'noquotes': True}
        actual = convert_LinkTimeMap_to_dict(lheader, **conversion_options)

        expected = {
            'original_uri': 'http://ogp.me',
            'timegate_uri': 'https://perma.cc/timegate/http://ogp.me',
            'timemap_uri': {
                'link_format': 'https://perma.cc/timemap/link/http://ogp.me',
                'json_format': 'https://perma.cc/timemap/json/http://ogp.me',
            },
            'mementos': {
                'list': [
                    {
                        'uri': 'https://perma.cc/7YXW-UFQ3',
                        'datetime': datetime.datetime(2015, 10, 4, 23, 18, 13),
                    },
                ],
            },
        }

        self.assertEqual(actual, expected)
Beispiel #4
0
    def test_good_link_headers(self):
        """A well-formed Link header with first/next/last mementos converts.

        The expected ``list`` holds one entry per memento link in header
        order, so the first memento appears twice (once for
        ``rel="first memento"`` and once for ``rel="memento"``).
        """
        lheader = ', '.join([
            '<http://ogp.me:80/>; rel="original"',
            '<https://web.archive.org/web/timemap/link/http://ogp.me:80/>; rel="timemap"; type="application/link-format"',
            '<https://web.archive.org/web/http://ogp.me:80/>; rel="timegate"',
            '<https://web.archive.org/web/20100802055126/http://ogp.me:80/>; rel="first memento"; datetime="Mon, 02 Aug 2010 05:51:26 GMT"',
            '<https://web.archive.org/web/20100802055126/http://ogp.me:80/>; rel="memento"; datetime="Mon, 02 Aug 2010 05:51:26 GMT"',
            '<https://web.archive.org/web/20101211091635/http://ogp.me/>; rel="next memento"; datetime="Sat, 11 Dec 2010 09:16:35 GMT"',
            '<https://web.archive.org/web/20210106030214/https://ogp.me/>; rel="last memento"; datetime="Wed, 06 Jan 2021 03:02:14 GMT"',
        ])

        actual = convert_LinkTimeMap_to_dict(lheader)

        def memento(urim, *moment):
            # Build a fresh dict per call so no two expected entries alias.
            return {'datetime': datetime.datetime(*moment), 'uri': urim}

        first_urim = 'https://web.archive.org/web/20100802055126/http://ogp.me:80/'
        next_urim = 'https://web.archive.org/web/20101211091635/http://ogp.me/'
        last_urim = 'https://web.archive.org/web/20210106030214/https://ogp.me/'

        expected = {
            'mementos': {
                'first': memento(first_urim, 2010, 8, 2, 5, 51, 26),
                'last': memento(last_urim, 2021, 1, 6, 3, 2, 14),
                'list': [
                    memento(first_urim, 2010, 8, 2, 5, 51, 26),
                    memento(first_urim, 2010, 8, 2, 5, 51, 26),
                    memento(next_urim, 2010, 12, 11, 9, 16, 35),
                    memento(last_urim, 2021, 1, 6, 3, 2, 14),
                ],
            },
            'original_uri': 'http://ogp.me:80/',
            'timegate_uri': 'https://web.archive.org/web/http://ogp.me:80/',
            'timemap_uri': {
                "link_format":
                "https://web.archive.org/web/timemap/link/http://ogp.me:80/"
            },
        }

        self.assertEqual(actual, expected)
Beispiel #5
0
def download_urits_and_extract_urims(uritlist, session):
    """Download every TimeMap in `uritlist` and collect the URI-Ms they list.

    All URI-Ts are requested concurrently through a FuturesSession wrapping
    `session`. Each response is then awaited in submission order; responses
    with HTTP status 200 are converted with `convert_LinkTimeMap_to_dict`
    and their URI-Ms extracted with `extract_urims_from_TimeMap`.

    TimeMaps are skipped (and contribute nothing to the result) when:
      * the request raises RequestException (a warning is logged),
      * the HTTP status is not 200,
      * URI-M extraction raises KeyError (logged and recorded in
        `hypercane.errors.errorstore`).

    Args:
        uritlist: iterable of URI-T strings; repeated URI-Ts are only
            requested and processed once.
        session: session object handed to FuturesSession.

    Returns:
        list: the URI-Ms extracted from all successfully processed TimeMaps.
    """
    urimlist = []
    cpucount = multiprocessing.cpu_count()
    futuresession = FuturesSession(session=session, max_workers=cpucount)

    # Submit everything up front so the downloads overlap. Keying by URI-T
    # also collapses duplicates, which previously caused a KeyError once the
    # first copy had been processed and removed.
    futures = {}
    for urit in uritlist:
        if urit not in futures:
            futures[urit] = futuresession.get(urit)

    for urit, future in futures.items():

        # result() blocks until this download has finished, so there is no
        # need to spin on done() while waiting.
        try:
            r = future.result()
        except RequestException as e:
            # Without skipping here, `r` would be unbound (or still hold the
            # previous TimeMap's response) in the code below.
            module_logger.warning(
                "Skipping TimeMap {}, encountered problem downloading it: {}"
                .format(urit, repr(e)))
            continue

        if r.status_code != 200:
            continue

        timemap_content = convert_LinkTimeMap_to_dict(r.text)

        try:
            urims = extract_urims_from_TimeMap(timemap_content)
        except KeyError as e:
            module_logger.exception(
                "Skipping TimeMap {}, encountered problem extracting URI-Ms from TimeMap: {}"
                .format(urit, repr(e)))
            hypercane.errors.errorstore.add(urit, traceback.format_exc())
            # Skip the extend below: `urims` is unbound or stale here.
            continue

        urimlist.extend(urims)

    return urimlist
Beispiel #6
0
 def getTimeMap(self, urit):
     """
         Fetch `urit` through `self.session` and return the response text
         converted to a dict by `convert_LinkTimeMap_to_dict`.

         NOTE(review): presumably the TimeMap has to have been stored via
         `addTimeMap` beforehand; that depends on how `self.session` is
         implemented -- confirm.
     """
     response = self.session.get(urit)
     link_format_text = response.text
     return convert_LinkTimeMap_to_dict(link_format_text)
Beispiel #7
0
    def test_7089_fig28(self):
        """A multi-line link-format TimeMap converts to the expected dict.

        The input spreads each link over several indented lines and carries
        extra attributes (``from``, ``until``, ``license``) that do not
        appear in the expected result. The ``rel="self"`` link is expected
        under ``timemap_uri['link_format']``.
        """

        timemap = """    <http://a.example.org>;rel="original",
    <http://arxiv.example.net/timemap/http://a.example.org>
      ; rel="self";type="application/link-format"
      ; from="Tue, 20 Jun 2000 18:02:59 GMT"
      ; until="Wed, 09 Apr 2008 20:30:51 GMT",
    <http://arxiv.example.net/timegate/http://a.example.org>
      ; rel="timegate",
    <http://arxiv.example.net/web/20000620180259/http://a.example.org>
      ; rel="first memento";datetime="Tue, 20 Jun 2000 18:02:59 GMT"
      ; license="http://creativecommons.org/publicdomain/zero/1.0/",
    <http://arxiv.example.net/web/20091027204954/http://a.example.org>
       ; rel="last memento";datetime="Tue, 27 Oct 2009 20:49:54 GMT"
       ; license="http://creativecommons.org/publicdomain/zero/1.0/",
    <http://arxiv.example.net/web/20000621011731/http://a.example.org>
      ; rel="memento";datetime="Wed, 21 Jun 2000 01:17:31 GMT"
      ; license="http://creativecommons.org/publicdomain/zero/1.0/",
    <http://arxiv.example.net/web/20000621044156/http://a.example.org>
      ; rel="memento";datetime="Wed, 21 Jun 2000 04:41:56 GMT"
      ; license="http://creativecommons.org/publicdomain/zero/1.0/"
      """

        actual = convert_LinkTimeMap_to_dict(timemap, debug=False)

        def memento(urim, *moment):
            # Build a fresh dict per call so no two expected entries alias.
            return {'datetime': datetime.datetime(*moment), 'uri': urim}

        first_urim = 'http://arxiv.example.net/web/20000620180259/http://a.example.org'
        last_urim = 'http://arxiv.example.net/web/20091027204954/http://a.example.org'

        # Listed in the order the memento links appear in the input text,
        # which is not chronological (the "last" memento comes second).
        memento_entries = [
            memento(first_urim, 2000, 6, 20, 18, 2, 59),
            memento(last_urim, 2009, 10, 27, 20, 49, 54),
            memento(
                'http://arxiv.example.net/web/20000621011731/http://a.example.org',
                2000, 6, 21, 1, 17, 31),
            memento(
                'http://arxiv.example.net/web/20000621044156/http://a.example.org',
                2000, 6, 21, 4, 41, 56),
        ]

        expected = {
            'mementos': {
                'first': memento(first_urim, 2000, 6, 20, 18, 2, 59),
                'last': memento(last_urim, 2009, 10, 27, 20, 49, 54),
                'list': memento_entries,
            },
            'original_uri': 'http://a.example.org',
            'timegate_uri':
            'http://arxiv.example.net/timegate/http://a.example.org',
            'timemap_uri': {
                'link_format':
                'http://arxiv.example.net/timemap/http://a.example.org'
            },
        }

        self.assertEqual(actual, expected)
Beispiel #8
0
def process_timemaps_for_mementos(urit_list, session):
    """Download the TimeMaps in `urit_list` and convert each one to a dict.

    The URI-Ts are requested through a FuturesSession wrapping `session`
    (via `get_uri_responses`). Each finished response is then sorted into
    one of two result dicts, both keyed by URI-T:

      * HTTP 200: the body is converted with
        `convert_LinkTimeMap_to_dict(..., skipErrors=True)` and stored in
        the TimeMap dict.
      * any other status: ``{"type": "http_error", "data": <response>}``
        is stored in the errors dict.
      * ConnectionError / TooManyRedirects while reading the result:
        ``{"type": "exception", "data": <exception>}`` is stored in the
        errors dict.

    Any other exception propagates to the caller.

    Returns:
        tuple: ``(timemap_data, errors_data)``.
    """
    timemap_data, errors_data = {}, {}

    with FuturesSession(max_workers=cpu_count, session=session) as session:
        futures = get_uri_responses(session, urit_list)

    pending_urits = list(futures.keys())

    # NOTE(review): list_generator is defined elsewhere; presumably it keeps
    # yielding entries until pending_urits has been emptied -- confirm.
    for urit in list_generator(pending_urits):

        module_logger.debug("checking if URI-T {} is done downloading".format(urit))

        if not futures[urit].done():
            continue

        module_logger.debug("URI-T {} is done, extracting content".format(urit))

        try:
            response = futures[urit].result()

            if response.status_code != 200:
                errors_data[urit] = {"type": "http_error", "data": response}
            else:
                body = response.text

                module_logger.info("adding TimeMap content for URI-T {}".format(
                    urit))

                timemap_data[urit] = convert_LinkTimeMap_to_dict(
                    body, skipErrors=True)

            pending_urits.remove(urit)

        except (ConnectionError, TooManyRedirects) as e:

            # Both failures are recorded identically; only the log line differs.
            if isinstance(e, ConnectionError):
                module_logger.warning("There was a connection error while attempting "
                    "to download URI-T {}".format(urit))
            else:
                module_logger.warning("There were too many redirects while attempting "
                    "to download URI-T {}".format(urit))

            errors_data[urit] = {"type": "exception", "data": e}

            pending_urits.remove(urit)

    return timemap_data, errors_data
    urit = links['timemap']['url']

    entry['first_memento_dt'] = datetime.strptime(
        r.headers['memento-datetime'],
        '%a, %d %b %Y %H:%M:%S GMT').strftime('%B %d, %Y')

    try:
        timegate = links['timegate']['url']
    except KeyError as e:
        print("failed to find the URI-G for URI-M {}".format(entry['urim']))
        entries.append(entry)
        continue

    r = requests.get(urit)

    timemap = convert_LinkTimeMap_to_dict(r.text)

    try:
        entry['memento_count'] = len(timemap['mementos']['list'])
    except KeyError as e:
        print("failed to count mementos in TimeMap at {}".format(urit))
        entries.append(entry)
        continue

    try:
        mementos = timemap['mementos']['list']
    except KeyError as e:
        print("failed to find mementos in TimeMap at {}".format(urit))
        entries.append(entry)
        continue