def testInternetArchiveNewest(self):
    archivedversion = weblib.getInternetArchiveURL('https://google.com')
    parsed = urlparse(archivedversion)
    self.assertIn(parsed.scheme, [u'http', u'https'])
    self.assertEqual(parsed.netloc, u'web.archive.org')
    self.assertTrue(parsed.path.strip('/').endswith('www.google.com'),
                    parsed.path)
def setLinkDead(self, url, error, page, weblink_dead_days):
    """Add the fact that the link was found dead to the .dat file."""
    with self.semaphore:
        now = time.time()
        if url in self.historyDict:
            timeSinceFirstFound = now - self.historyDict[url][0][1]
            timeSinceLastFound = now - self.historyDict[url][-1][1]
            # if the last time we found this dead link is less than an hour
            # ago, we won't save it in the history this time.
            if timeSinceLastFound > 60 * 60:
                self.historyDict[url].append((page.title(), now, error))
            # if the first time we found this link longer than x day ago
            # (default is a week), it should probably be fixed or removed.
            # We'll list it in a file so that it can be removed manually.
            if timeSinceFirstFound > 60 * 60 * 24 * weblink_dead_days:
                # search for archived page
                try:
                    archiveURL = get_archive_url(url)
                except Exception as e:
                    pywikibot.warning(
                        'get_closest_memento_url({0}) failed: {1}'.format(
                            url, e))
                    archiveURL = None
                if archiveURL is None:
                    archiveURL = weblib.getInternetArchiveURL(url)
                if archiveURL is None:
                    archiveURL = weblib.getWebCitationURL(url)
                self.log(url, error, page, archiveURL)
        else:
            self.historyDict[url] = [(page.title(), now, error)]
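# Note (illustration, not part of the snippets in this file): the variant above
# uses `self.semaphore` as a context manager, while the variants below call
# acquire()/release() explicitly. The `with` form releases the lock even if an
# exception is raised inside the block. Minimal standalone sketch, assuming a
# plain threading.Semaphore guarding a shared history dict:
import threading

semaphore = threading.Semaphore()
history = {}


def guarded_append(url, entry):
    # Same locking pattern as setLinkDead: hold the semaphore while
    # mutating the shared history mapping.
    with semaphore:
        history.setdefault(url, []).append(entry)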
def setLinkDead(self, url, error, page, day):
    """Adds the fact that the link was found dead to the .dat file."""
    self.semaphore.acquire()
    now = time.time()
    if url in self.historyDict:
        timeSinceFirstFound = now - self.historyDict[url][0][1]
        timeSinceLastFound = now - self.historyDict[url][-1][1]
        # if the last time we found this dead link is less than an hour
        # ago, we won't save it in the history this time.
        if timeSinceLastFound > 60 * 60:
            self.historyDict[url].append((page.title(), now, error))
        # if the first time we found this link longer than x day ago
        # (default is a week), it should probably be fixed or removed.
        # We'll list it in a file so that it can be removed manually.
        if timeSinceFirstFound > 60 * 60 * 24 * day:
            # search for archived page
            archiveURL = weblib.getInternetArchiveURL(url)
            if archiveURL is None:
                archiveURL = weblib.getWebCitationURL(url)
            self.log(url, error, page, archiveURL)
    else:
        self.historyDict[url] = [(page.title(), now, error)]
    self.semaphore.release()
def testInternetArchiveOlder(self):
    archivedversion = weblib.getInternetArchiveURL("https://google.com",
                                                   "200606")
    parsed = urlparse(archivedversion)
    self.assertIn(parsed.scheme, ["http", "https"])
    self.assertEqual(parsed.netloc, "web.archive.org")
    self.assertTrue(parsed.path.strip("/").endswith("www.google.com"),
                    parsed.path)
    self.assertIn("200606", parsed.path)
def setLinkDead(self, url, error, page, day):
    """Add the fact that the link was found dead to the .dat file."""
    self.semaphore.acquire()
    now = time.time()
    if url in self.historyDict:
        timeSinceFirstFound = now - self.historyDict[url][0][1]
        timeSinceLastFound = now - self.historyDict[url][-1][1]
        # if the last time we found this dead link is less than an hour
        # ago, we won't save it in the history this time.
        if timeSinceLastFound > 60 * 60:
            self.historyDict[url].append((page.title(), now, error))
        # if the first time we found this link longer than x day ago
        # (default is a week), it should probably be fixed or removed.
        # We'll list it in a file so that it can be removed manually.
        if timeSinceFirstFound > 60 * 60 * 24 * day:
            # search for archived page
            try:
                archiveURL = get_archive_url(url)
            except Exception as e:
                pywikibot.warning(
                    'get_closest_memento_url({0}) failed: {1}'.format(
                        url, e))
                archiveURL = None
            if archiveURL is None:
                archiveURL = weblib.getInternetArchiveURL(url)
            if archiveURL is None:
                archiveURL = weblib.getWebCitationURL(url)
            self.log(url, error, page, archiveURL)
    else:
        self.historyDict[url] = [(page.title(), now, error)]
    self.semaphore.release()
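# Illustrative sketch (not taken verbatim from the bot): the archive lookup in
# the setLinkDead variants above first tries the Memento-based get_archive_url
# and then falls back to the deprecated weblib helpers. Factored out on its
# own it would look roughly like this; `get_archive_url`, `weblib` and
# `pywikibot` are assumed to be the same names imported by the snippets above.
def find_archived_copy(url):
    """Return an archived copy of url, or None if no archive is known."""
    try:
        archive_url = get_archive_url(url)
    except Exception as e:
        pywikibot.warning(
            'get_closest_memento_url({0}) failed: {1}'.format(url, e))
        archive_url = None
    if archive_url is None:
        archive_url = weblib.getInternetArchiveURL(url)
    if archive_url is None:
        archive_url = weblib.getWebCitationURL(url)
    return archive_url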
def testInternetArchiveOlder(self):
    archivedversion = weblib.getInternetArchiveURL('https://google.com',
                                                   '200606')
    parsed = urlparse(archivedversion)
    self.assertIn(parsed.scheme, [u'http', u'https'])
    self.assertEqual(parsed.netloc, u'web.archive.org')
    self.assertTrue(parsed.path.strip('/').endswith('www.google.com'),
                    parsed.path)
    self.assertIn('200606', parsed.path)
def testInternetArchiveNewest(self):
    with PatchedHttp(weblib, False) as p:
        p.after_fetch = self._test_response
        archivedversion = weblib.getInternetArchiveURL('https://google.com')
    parsed = urlparse(archivedversion)
    self.assertIn(parsed.scheme, [u'http', u'https'])
    self.assertEqual(parsed.netloc, u'web.archive.org')
    self.assertTrue(parsed.path.strip('/').endswith('www.google.com'),
                    parsed.path)
def _get_archive_url(self, url, date_string=None):
    with PatchedHttp(weblib, False) as p:
        p.after_fetch = self._test_response
        try:
            archivedversion = weblib.getInternetArchiveURL(url, date_string)
        except RequestsConnectionError as e:
            self.skipTest(e)
    self.assertOneDeprecation()
    return archivedversion
def _get_archive_url(self, url, date_string=None):
    with PatchedHttp(weblib, False) as p:
        p.after_fetch = self._test_response
        archivedversion = weblib.getInternetArchiveURL(url, date_string)
    self.assertOneDeprecation()
    return archivedversion
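# Illustrative sketch: with the _get_archive_url helper above, the "newest"
# and "older" tests can be expressed in terms of it instead of calling weblib
# directly. The assertions mirror the test snippets earlier in this file; the
# exact method names are an assumption.
def testInternetArchiveNewest(self):
    archivedversion = self._get_archive_url('https://google.com')
    parsed = urlparse(archivedversion)
    self.assertIn(parsed.scheme, [u'http', u'https'])
    self.assertEqual(parsed.netloc, u'web.archive.org')
    self.assertTrue(parsed.path.strip('/').endswith('www.google.com'),
                    parsed.path)


def testInternetArchiveOlder(self):
    archivedversion = self._get_archive_url('https://google.com', '200606')
    parsed = urlparse(archivedversion)
    self.assertIn(parsed.scheme, [u'http', u'https'])
    self.assertEqual(parsed.netloc, u'web.archive.org')
    self.assertTrue(parsed.path.strip('/').endswith('www.google.com'),
                    parsed.path)
    self.assertIn('200606', parsed.path)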