Example #1
 def testInternetArchiveNewest(self):
     archivedversion = weblib.getInternetArchiveURL('https://google.com')
     parsed = urlparse(archivedversion)
     self.assertIn(parsed.scheme, [u'http', u'https'])
     self.assertEqual(parsed.netloc, u'web.archive.org')
     self.assertTrue(
         parsed.path.strip('/').endswith('www.google.com'), parsed.path)
Example #2
 def setLinkDead(self, url, error, page, weblink_dead_days):
     """Add the fact that the link was found dead to the .dat file."""
     with self.semaphore:
         now = time.time()
         if url in self.historyDict:
             timeSinceFirstFound = now - self.historyDict[url][0][1]
             timeSinceLastFound = now - self.historyDict[url][-1][1]
             # if the last time we found this dead link is less than an hour
             # ago, we won't save it in the history this time.
             if timeSinceLastFound > 60 * 60:
                 self.historyDict[url].append((page.title(), now, error))
             # if the first time we found this link longer than x day ago
             # (default is a week), it should probably be fixed or removed.
             # We'll list it in a file so that it can be removed manually.
             if timeSinceFirstFound > 60 * 60 * 24 * weblink_dead_days:
                 # search for archived page
                 try:
                     archiveURL = get_archive_url(url)
                 except Exception as e:
                     pywikibot.warning(
                         'get_closest_memento_url({0}) failed: {1}'.format(
                             url, e))
                     archiveURL = None
                 if archiveURL is None:
                     archiveURL = weblib.getInternetArchiveURL(url)
                 if archiveURL is None:
                     archiveURL = weblib.getWebCitationURL(url)
                 self.log(url, error, page, archiveURL)
         else:
             self.historyDict[url] = [(page.title(), now, error)]
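Example #2 (and the later setLinkDead variants below) queries the archive services in a fixed order: a Memento lookup via the get_archive_url helper first, then the Internet Archive, then WebCitation. A minimal standalone sketch of that fallback chain follows; it relies only on the weblib functions already shown above, while the function name find_archived_copy and the memento_lookup callable are illustrative placeholders rather than pywikibot API.

 import pywikibot
 from pywikibot import weblib

 def find_archived_copy(url, memento_lookup=None):
     """Return an archived copy of url, or None if no service has one."""
     archive_url = None
     if memento_lookup is not None:
         try:
             # optional Memento lookup, mirroring get_archive_url above
             archive_url = memento_lookup(url)
         except Exception as e:
             pywikibot.warning(
                 'memento lookup for {0} failed: {1}'.format(url, e))
     if archive_url is None:
         # fall back to the Wayback Machine
         archive_url = weblib.getInternetArchiveURL(url)
     if archive_url is None:
         # last resort: WebCitation
         archive_url = weblib.getWebCitationURL(url)
     return archive_url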
Example #3
 def setLinkDead(self, url, error, page, day):
     """
     Adds the fact that the link was found dead to the .dat file.
     """
     self.semaphore.acquire()
     now = time.time()
     if url in self.historyDict:
         timeSinceFirstFound = now - self.historyDict[url][0][1]
         timeSinceLastFound = now - self.historyDict[url][-1][1]
         # if the last time we found this dead link is less than an hour
         # ago, we won't save it in the history this time.
         if timeSinceLastFound > 60 * 60:
             self.historyDict[url].append((page.title(), now, error))
         # if the first time we found this link longer than x day ago
         # (default is a week), it should probably be fixed or removed.
         # We'll list it in a file so that it can be removed manually.
         if timeSinceFirstFound > 60 * 60 * 24 * day:
             # search for archived page
             archiveURL = weblib.getInternetArchiveURL(url)
             if archiveURL is None:
                 archiveURL = weblib.getWebCitationURL(url)
             self.log(url, error, page, archiveURL)
     else:
         self.historyDict[url] = [(page.title(), now, error)]
     self.semaphore.release()
Example #4
 def testInternetArchiveOlder(self):
     archivedversion = weblib.getInternetArchiveURL("https://google.com", "200606")
     parsed = urlparse(archivedversion)
     self.assertIn(parsed.scheme, ["http", "https"])
     self.assertEqual(parsed.netloc, "web.archive.org")
     self.assertTrue(parsed.path.strip("/").endswith("www.google.com"), parsed.path)
     self.assertIn("200606", parsed.path)
Example #5
 def setLinkDead(self, url, error, page, day):
     """Add the fact that the link was found dead to the .dat file."""
     self.semaphore.acquire()
     now = time.time()
     if url in self.historyDict:
         timeSinceFirstFound = now - self.historyDict[url][0][1]
         timeSinceLastFound = now - self.historyDict[url][-1][1]
         # if the last time we found this dead link is less than an hour
         # ago, we won't save it in the history this time.
         if timeSinceLastFound > 60 * 60:
             self.historyDict[url].append((page.title(), now, error))
         # if the first time we found this link longer than x day ago
         # (default is a week), it should probably be fixed or removed.
         # We'll list it in a file so that it can be removed manually.
         if timeSinceFirstFound > 60 * 60 * 24 * day:
             # search for archived page
             try:
                 archiveURL = get_archive_url(url)
             except Exception as e:
                 pywikibot.warning(
                     'get_closest_memento_url({0}) failed: {1}'.format(
                         url, e))
                 archiveURL = None
             if archiveURL is None:
                 archiveURL = weblib.getInternetArchiveURL(url)
             if archiveURL is None:
                 archiveURL = weblib.getWebCitationURL(url)
             self.log(url, error, page, archiveURL)
     else:
         self.historyDict[url] = [(page.title(), now, error)]
     self.semaphore.release()
Example #6
 def testInternetArchiveOlder(self):
     archivedversion = weblib.getInternetArchiveURL('https://google.com', '200606')
     parsed = urlparse(archivedversion)
     self.assertIn(parsed.scheme, [u'http', u'https'])
     self.assertEqual(parsed.netloc, u'web.archive.org')
     self.assertTrue(parsed.path.strip('/').endswith('www.google.com'), parsed.path)
     self.assertIn('200606', parsed.path)
Example #7
 def testInternetArchiveNewest(self):
     with PatchedHttp(weblib, False) as p:
         p.after_fetch = self._test_response
         archivedversion = weblib.getInternetArchiveURL('https://google.com')
     parsed = urlparse(archivedversion)
     self.assertIn(parsed.scheme, [u'http', u'https'])
     self.assertEqual(parsed.netloc, u'web.archive.org')
     self.assertTrue(parsed.path.strip('/').endswith('www.google.com'), parsed.path)
Example #8
 def _get_archive_url(self, url, date_string=None):
     with PatchedHttp(weblib, False) as p:
         p.after_fetch = self._test_response
         try:
             archivedversion = weblib.getInternetArchiveURL(
                 url, date_string)
         except RequestsConnectionError as e:
             self.skipTest(e)
         self.assertOneDeprecation()
         return archivedversion
Example #9
 def _get_archive_url(self, url, date_string=None):
     with PatchedHttp(weblib, False) as p:
         p.after_fetch = self._test_response
         try:
             archivedversion = weblib.getInternetArchiveURL(
                 url, date_string)
         except RequestsConnectionError as e:
             self.skipTest(e)
         self.assertOneDeprecation()
         return archivedversion
Example #10
 def _get_archive_url(self, url, date_string=None):
     with PatchedHttp(weblib, False) as p:
         p.after_fetch = self._test_response
         archivedversion = weblib.getInternetArchiveURL(url, date_string)
         self.assertOneDeprecation()
         return archivedversion
Example #11
 def _get_archive_url(self, url, date_string=None):
     with PatchedHttp(weblib, False) as p:
         p.after_fetch = self._test_response
         archivedversion = weblib.getInternetArchiveURL(url, date_string)
         self.assertOneDeprecation()
         return archivedversion
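For completeness, a minimal sketch of calling the weblib helpers directly, outside the test harness. Both functions take a URL plus an optional timestamp string and return an archive URL or None; the URL and timestamp below are the same placeholders used in the tests above. The assertOneDeprecation calls in the later examples indicate these helpers are deprecated, so this only applies to pywikibot versions where weblib is still available.

 from pywikibot import weblib

 url = 'https://google.com'
 # closest Wayback Machine snapshot to June 2006, as in testInternetArchiveOlder
 archived = weblib.getInternetArchiveURL(url, '200606')
 if archived is None:
     # fall back to WebCitation if the Wayback Machine has nothing
     archived = weblib.getWebCitationURL(url)
 print(archived)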