def test_open_archive_directory(self):
    """
    Archive a URL twice without compression, then verify that opening
    the directory yields an ArchivedURL object for each archive.
    """
    # A path that is not a directory must be rejected
    with self.assertRaises(ValueError):
        storytracker.open_archive_directory("./foo.bar")
    # Create two uncompressed archives in the temporary directory
    storytracker.archive(self.url, compress=False, output_dir=self.tmpdir)
    storytracker.archive(self.url, compress=False, output_dir=self.tmpdir)
    urlset = storytracker.open_archive_directory(self.tmpdir)
    # BUG FIX: the original `assertTrue(len(urlset), 2)` always passed
    # because the second argument of assertTrue is the failure message,
    # not an expected value; assertEqual actually checks the count.
    self.assertEqual(len(urlset), 2)
    # Plain loop instead of a side-effect list comprehension
    for o in urlset:
        self.assertTrue(isinstance(o, ArchivedURL))
def result():
    """
    Archive the requested URL and render its hyperlinks, keeping only
    those whose href contains the requested link type.
    """
    # Both branches gather the same two fields; only the source differs
    if request.method == 'POST':
        u = request.form['url']
        link_type = request.form['link_type']
    else:
        u = request.args.get('url', '')
        link_type = request.args.get('link_type', '')
    page = storytracker.archive(u)
    links = [link for link in page.hyperlinks if link_type in link.href]
    return render_template(
        'results.html',
        links=links,
        num_links=len(links),
        url=u,
        link_type=link_type
    )
def test_archive(self):
    """
    Exercise storytracker.archive with several option combinations and
    confirm each returns an ArchivedURL and writes the expected files.
    """
    self.archive = storytracker.archive(self.url)
    variants = [
        self.archive,
        storytracker.archive(self.url, minify=False),
        storytracker.archive(self.url, extend_urls=False),
        storytracker.archive(self.url, output_dir=self.tmpdir),
        storytracker.archive(self.url, compress=False, output_dir=self.tmpdir),
    ]
    for archived in variants:
        self.assertTrue(isinstance(archived, storytracker.ArchivedURL))
    # With output_dir alone a gzip is written; compress=False writes HTML
    gzip_path = variants[3].gzip_archive_path
    html_path = variants[4].html_archive_path
    self.assertTrue(os.path.exists(gzip_path))
    self.assertTrue(os.path.exists(html_path))
    # Clean up the files created on disk
    os.remove(gzip_path)
    os.remove(html_path)
def test_analyze(self):
    """
    Verify that analyze() fills the hyperlink and image caches,
    which start out empty.
    """
    self.archive = storytracker.archive(self.url)
    # Caches are empty before analysis runs
    self.assertEqual(self.archive._hyperlinks, [])
    self.assertEqual(self.archive._images, [])
    self.archive.analyze()
    first_link = self.archive._hyperlinks[0]
    first_image = self.archive._images[0]
    self.assertTrue(isinstance(first_link, storytracker.Hyperlink))
    self.assertTrue(isinstance(first_image, storytracker.Image))
def result():
    """
    Archive a user-supplied URL and render the hyperlinks whose href
    contains the requested link-type substring.
    """
    is_post = request.method == 'POST'
    # POST reads from the form body; GET falls back to the query string
    source = request.form if is_post else request.args
    if is_post:
        u = source['url']
        link_type = source['link_type']
    else:
        u = source.get('url', '')
        link_type = source.get('link_type', '')
    url = storytracker.archive(u)
    links = [h for h in url.hyperlinks if link_type in h.href]
    return render_template(
        'results.html',
        links=links,
        num_links=len(links),
        url=u,
        link_type=link_type
    )
def test_analyze(self):
    """
    analyze() should populate the private hyperlink and image lists
    that begin empty on a fresh archive.
    """
    self.archive = storytracker.archive(self.url)
    archive = self.archive
    # Both caches must be empty until analyze() is called
    self.assertEqual(archive._hyperlinks, [])
    self.assertEqual(archive._images, [])
    archive.analyze()
    self.assertTrue(
        isinstance(archive._hyperlinks[0], storytracker.Hyperlink)
    )
    self.assertTrue(
        isinstance(archive._images[0], storytracker.Image)
    )
def open_wayback_machine_url(url, **kwargs):
    """
    Accepts an URL from the Internet Archive's Wayback Machine and
    returns an ArchivedURL object
    """
    # Recover the original page URL and its capture timestamp
    archive_url, timestamp = storytracker.reverse_wayback_machine_url(url)
    # The "id_" flag asks the Wayback Machine for the raw page, without
    # the navigation chrome it normally injects into archived HTML
    fetch_url = url if "id_" in url else url.replace(
        "/%s" % archive_url,
        "id_/%s" % archive_url
    )
    # Download the raw HTML and package everything up
    raw_html = storytracker.archive(fetch_url, **kwargs).html
    return ArchivedURL(archive_url, timestamp, raw_html)
def open_wayback_machine_url(url, **kwargs):
    """
    Accepts an URL from the Internet Archive's Wayback Machine and
    returns an ArchivedURL object
    """
    # Split the archive URL into the page it captured and when
    archive_url, timestamp = storytracker.reverse_wayback_machine_url(url)
    # Rewrite to the "id_" form, which serves the raw archived HTML
    # with none of the Wayback Machine's own chrome or navigation
    if "id_" not in url:
        needle = "/%s" % archive_url
        replacement = "id_/%s" % archive_url
        url = url.replace(needle, replacement)
    # Fetch the raw HTML via storytracker's archiver
    html = storytracker.archive(url, **kwargs).html
    return ArchivedURL(archive_url, timestamp, html)
def test_open_archive_html(self):
    """
    Round-trip an uncompressed HTML archive through
    open_archive_filepath and confirm it matches the original.
    """
    saved = storytracker.archive(
        self.url,
        output_dir=self.tmpdir,
        compress=False
    )
    reopened = storytracker.open_archive_filepath(saved.html_archive_path)
    self.assertEqual(saved, reopened)
def test_url(self):
    """
    Smoke-test the ArchivedURL interface: metadata accessors, browser
    lifecycle, gzip output, hyperlink parsing, CSV export and images.
    """
    self.archive = storytracker.archive(self.url)
    obj = self.archive
    # Metadata: touch each accessor to ensure none of them raise
    self.assertEqual(self.url, obj.url)
    obj.timestamp
    obj.html
    obj.gzip
    obj.__unicode__()
    obj.__str__()
    obj.__repr__()
    # Browser: lazily created and safe to close twice
    self.assertEqual(self.archive._browser, None)
    self.archive.get_browser()
    self.assertTrue(isinstance(self.archive._browser, webdriver.PhantomJS))
    self.archive.close_browser()
    self.assertEqual(self.archive._browser, None)
    self.archive.close_browser()
    # Gzip
    self.assertEqual(obj.gzip_archive_path, None)
    obj.write_gzip_to_directory(self.tmpdir)
    # Hyperlinks
    self.assertEqual(obj._hyperlinks, [])
    self.assertTrue(isinstance(obj.hyperlinks, list))
    self.assertEqual(obj._hyperlinks, obj.hyperlinks)
    # IDIOM FIX: plain loop instead of a side-effect list comprehension
    for a in obj.hyperlinks:
        self.assertTrue(isinstance(a, Hyperlink))
    a = obj.hyperlinks[0]
    a.href
    a.string
    a.domain
    a.index
    if a.images:
        for i in a.images:
            self.assertTrue(isinstance(i, Image))
            i.src
            i.__unicode__()
    a.__unicode__()
    a.__str__()
    a.__repr__()
    a.__csv__()
    # Hyperlinks to CSV: in-memory buffer and a real file on disk
    f = six.StringIO()
    f = obj.write_hyperlinks_csv_to_file(f)
    p = os.path.join(self.tmpdir, "links.csv")
    f2 = open(p, "w+")
    f2 = obj.write_hyperlinks_csv_to_file(f2)
    self.assertTrue(os.path.exists(p))
    # BUG FIX: close the handle before removing it — the original leaked
    # the file object, and os.remove on an open file fails on Windows
    f2.close()
    os.remove(p)
    # Images
    self.assertEqual(obj._images, [])
    self.assertTrue(len(obj.images) > 0)
    self.assertTrue(isinstance(obj.images, list))
    self.assertEqual(obj._images, obj.images)
    # IDIOM FIX: plain loop instead of a side-effect list comprehension
    for i in obj.images:
        self.assertTrue(isinstance(i, Image))
    img = obj.images[0]
    img.src
    img.__unicode__()
    img.__str__()
    img.__repr__()
'http://example.com', datetime(2014, 1, 1, 2, 0, 0), open("./example/c.html", "rb").read() ), storytracker.ArchivedURL( 'http://example.com', datetime(2014, 1, 1, 3, 0, 0), open("./example/d.html", "rb").read() ), ]) # Run through all the pages urlset.analyze() ## URL images obj = storytracker.archive("http://www.cnn.com/") illo_path = obj.write_illustration_to_directory("./docs/_static/example/") overlay_path = obj.write_overlay_to_directory("./docs/_static/example/") os.rename(illo_path, "./docs/_static/example/illo.jpg") os.rename(overlay_path, "./docs/_static/example/overlay.png") # URL images urlset2 = storytracker.ArchivedURLSet([ storytracker.open_wayback_machine_url("https://web.archive.org/web/20140101005148/http://www.bbc.co.uk/news/"), storytracker.open_wayback_machine_url("https://web.archive.org/web/20140101080323/http://www.bbc.co.uk/news/"), storytracker.open_wayback_machine_url("https://web.archive.org/web/20140101094432/http://www.bbc.co.uk/news/"), ]) urlset2[0].write_overlay_to_directory("./") gif_path = urlset2.write_href_overlay_animation_to_directory( "https://web.archive.org/news/world-africa-25561753", "./docs/_static/example/"
storytracker.ArchivedURL('http://example.com', datetime(2014, 1, 1, 1, 0, 0), open("./example/b.html", "rb").read()), storytracker.ArchivedURL('http://example.com', datetime(2014, 1, 1, 2, 0, 0), open("./example/c.html", "rb").read()), storytracker.ArchivedURL('http://example.com', datetime(2014, 1, 1, 3, 0, 0), open("./example/d.html", "rb").read()), ]) # Run through all the pages urlset.analyze() ## URL images obj = storytracker.archive("http://www.cnn.com/") illo_path = obj.write_illustration_to_directory("./docs/_static/example/") overlay_path = obj.write_overlay_to_directory("./docs/_static/example/") os.rename(illo_path, "./docs/_static/example/illo.jpg") os.rename(overlay_path, "./docs/_static/example/overlay.png") # URL images urlset2 = storytracker.ArchivedURLSet([ storytracker.open_wayback_machine_url( "https://web.archive.org/web/20140101005148/http://www.bbc.co.uk/news/" ), storytracker.open_wayback_machine_url( "https://web.archive.org/web/20140101080323/http://www.bbc.co.uk/news/" ), storytracker.open_wayback_machine_url( "https://web.archive.org/web/20140101094432/http://www.bbc.co.uk/news/"
def test_open_archive_gzip(self):
    """
    Round-trip a gzipped archive through open_archive_filepath and
    confirm the reopened object equals the original.
    """
    saved = storytracker.archive(self.url, output_dir=self.tmpdir)
    reopened = storytracker.open_archive_filepath(saved.gzip_archive_path)
    self.assertEqual(saved, reopened)
def test_url(self):
    """
    Smoke-test the ArchivedURL interface: metadata accessors, browser
    lifecycle, gzip output, hyperlink parsing, CSV export and images.
    """
    self.archive = storytracker.archive(self.url)
    obj = self.archive
    # Metadata: touch each accessor to ensure none of them raise
    self.assertEqual(self.url, obj.url)
    obj.timestamp
    obj.html
    obj.gzip
    obj.__unicode__()
    obj.__str__()
    obj.__repr__()
    # Browser: lazily created and safe to close twice
    self.assertEqual(self.archive._browser, None)
    self.archive.get_browser()
    self.assertTrue(isinstance(self.archive._browser, webdriver.PhantomJS))
    self.archive.close_browser()
    self.assertEqual(self.archive._browser, None)
    self.archive.close_browser()
    # Gzip
    self.assertEqual(obj.gzip_archive_path, None)
    obj.write_gzip_to_directory(self.tmpdir)
    # Hyperlinks
    self.assertEqual(obj._hyperlinks, [])
    self.assertTrue(isinstance(obj.hyperlinks, list))
    self.assertEqual(obj._hyperlinks, obj.hyperlinks)
    # IDIOM FIX: plain loop instead of a side-effect list comprehension
    for a in obj.hyperlinks:
        self.assertTrue(isinstance(a, Hyperlink))
    a = obj.hyperlinks[0]
    a.href
    a.string
    a.domain
    a.index
    if a.images:
        for i in a.images:
            self.assertTrue(isinstance(i, Image))
            i.src
            i.__unicode__()
    a.__unicode__()
    a.__str__()
    a.__repr__()
    a.__csv__()
    # Hyperlinks to CSV: in-memory buffer and a real file on disk
    f = six.StringIO()
    f = obj.write_hyperlinks_csv_to_file(f)
    p = os.path.join(self.tmpdir, 'links.csv')
    f2 = open(p, 'w+')
    f2 = obj.write_hyperlinks_csv_to_file(f2)
    self.assertTrue(os.path.exists(p))
    # BUG FIX: close the handle before removing it — the original leaked
    # the file object, and os.remove on an open file fails on Windows
    f2.close()
    os.remove(p)
    # Images
    self.assertEqual(obj._images, [])
    self.assertTrue(len(obj.images) > 0)
    self.assertTrue(isinstance(obj.images, list))
    self.assertEqual(obj._images, obj.images)
    # IDIOM FIX: plain loop instead of a side-effect list comprehension
    for i in obj.images:
        self.assertTrue(isinstance(i, Image))
    img = obj.images[0]
    img.src
    img.__unicode__()
    img.__str__()
    img.__repr__()