Example #1
0
 def test_open_archive_directory(self):
     """
     Check that open_archive_directory raises ValueError for the bogus
     path "./foo.bar" and returns one ArchivedURL per archive saved
     into the temporary directory.
     """
     with self.assertRaises(ValueError):
         storytracker.open_archive_directory("./foo.bar")
     # Save two uncompressed archives of the same URL to the temp directory
     storytracker.archive(self.url, compress=False, output_dir=self.tmpdir)
     storytracker.archive(self.url, compress=False, output_dir=self.tmpdir)
     urlset = storytracker.open_archive_directory(self.tmpdir)
     # assertTrue(x, 2) would use 2 as the failure message and pass for
     # any non-empty set, so compare the length explicitly.
     self.assertEqual(len(urlset), 2)
     # A plain loop, since the assertions are run only for their effect
     for o in urlset:
         self.assertIsInstance(o, ArchivedURL)
Example #2
0
 def test_open_archive_directory(self):
     """
     Opening "./foo.bar" is expected to raise ValueError; opening the
     temporary directory after two archives were saved there should
     yield a set of exactly two ArchivedURL objects.
     """
     with self.assertRaises(ValueError):
         storytracker.open_archive_directory("./foo.bar")
     # Write two uncompressed archives so the directory has known contents
     for _ in range(2):
         storytracker.archive(
             self.url,
             compress=False,
             output_dir=self.tmpdir
         )
     urlset = storytracker.open_archive_directory(self.tmpdir)
     # The second argument of assertTrue is only a message, so the size
     # was never verified; assertEqual performs the intended comparison.
     self.assertEqual(len(urlset), 2)
     for o in urlset:
         self.assertIsInstance(o, ArchivedURL)
Example #3
0
def result():
    """
    Archive the submitted URL and render 'results.html' with the
    hyperlinks whose href contains the requested link_type.

    POST requests read 'url' and 'link_type' from the form by direct
    indexing; any other method reads them from the query string and
    falls back to an empty string when a parameter is missing.
    """
    if request.method == 'POST':
        u = request.form['url']
        link_type = request.form['link_type']
    else:
        u = request.args.get('url', '')
        link_type = request.args.get('link_type', '')
    # Only the parameter source differs between the two request styles,
    # so the archiving and filtering are done once for both.
    # NOTE(review): u comes from the request and is handed straight to
    # storytracker.archive -- confirm arbitrary URLs are acceptable here.
    url = storytracker.archive(u)
    links = [h for h in url.hyperlinks if link_type in h.href]
    return render_template(
        'results.html',
        links=links,
        num_links=len(links),
        url=u,
        link_type=link_type
    )
Example #4
0
 def test_archive(self):
     """
     Call storytracker.archive with several keyword options, check that
     every call returns an ArchivedURL, and check that the two calls
     given an output directory left a file on disk.
     """
     self.archive = storytracker.archive(self.url)
     obj1 = self.archive
     obj2 = storytracker.archive(self.url, minify=False)
     obj3 = storytracker.archive(self.url, extend_urls=False)
     obj4 = storytracker.archive(self.url, output_dir=self.tmpdir)
     obj5 = storytracker.archive(
         self.url,
         compress=False,
         output_dir=self.tmpdir
     )
     for obj in [obj1, obj2, obj3, obj4, obj5]:
         # assertIsInstance reports the actual type when it fails
         self.assertIsInstance(obj, storytracker.ArchivedURL)
     # obj4 is expected to expose a gzip path, obj5 (compress=False)
     # an HTML path
     self.assertTrue(os.path.exists(obj4.gzip_archive_path))
     self.assertTrue(os.path.exists(obj5.html_archive_path))
     # Remove the files this test wrote
     os.remove(obj4.gzip_archive_path)
     os.remove(obj5.html_archive_path)
Example #5
0
 def test_archive(self):
     """
     Archive the test URL five ways (defaults, minify=False,
     extend_urls=False, with an output directory, and uncompressed with
     an output directory) and verify the results.
     """
     self.archive = storytracker.archive(self.url)
     obj1 = self.archive
     obj2 = storytracker.archive(self.url, minify=False)
     obj3 = storytracker.archive(self.url, extend_urls=False)
     obj4 = storytracker.archive(self.url, output_dir=self.tmpdir)
     obj5 = storytracker.archive(self.url,
                                 compress=False,
                                 output_dir=self.tmpdir)
     # Every variant must come back as an ArchivedURL; assertIsInstance
     # names the unexpected type in its failure message.
     for obj in [obj1, obj2, obj3, obj4, obj5]:
         self.assertIsInstance(obj, storytracker.ArchivedURL)
     # The two variants given an output_dir must have written a file
     self.assertTrue(os.path.exists(obj4.gzip_archive_path))
     self.assertTrue(os.path.exists(obj5.html_archive_path))
     # Clean up the files created above
     os.remove(obj4.gzip_archive_path)
     os.remove(obj5.html_archive_path)
Example #6
0
 def test_analyze(self):
     """
     Check that the private hyperlink and image lists start out empty
     and that analyze() fills them with Hyperlink and Image objects.
     """
     self.archive = storytracker.archive(self.url)
     # Nothing should be parsed before analyze() is called
     self.assertEqual(self.archive._hyperlinks, [])
     self.assertEqual(self.archive._images, [])
     self.archive.analyze()
     # Only the first element of each list is type-checked;
     # assertIsInstance shows the actual type if the check fails.
     self.assertIsInstance(
         self.archive._hyperlinks[0],
         storytracker.Hyperlink
     )
     self.assertIsInstance(self.archive._images[0], storytracker.Image)
Example #7
0
def result():
    """
    Render 'results.html' listing the hyperlinks of the requested URL
    whose href contains link_type.

    For POST requests 'url' and 'link_type' are indexed directly from
    the form; for other methods they are taken from the query string
    with '' as the default.
    """
    if request.method == 'POST':
        u = request.form['url']
        link_type = request.form['link_type']
    else:
        u = request.args.get('url', '')
        link_type = request.args.get('link_type', '')
    # The archive-and-filter step was repeated verbatim in both branches;
    # it is shared here so the two request styles cannot drift apart.
    # NOTE(review): u is request-supplied and passed unchanged to
    # storytracker.archive -- confirm that is intended.
    url = storytracker.archive(u)
    links = [h for h in url.hyperlinks if link_type in h.href]
    return render_template('results.html',
                           links=links,
                           num_links=len(links),
                           url=u,
                           link_type=link_type)
Example #8
0
 def test_analyze(self):
     """
     Before analyze() the _hyperlinks and _images lists are expected to
     be empty; afterwards their first items must be a Hyperlink and an
     Image respectively.
     """
     self.archive = storytracker.archive(self.url)
     self.assertEqual(self.archive._hyperlinks, [])
     self.assertEqual(self.archive._images, [])
     self.archive.analyze()
     # assertIsInstance replaces assertTrue(isinstance(...)) so that a
     # failure names the type that was actually found.
     self.assertIsInstance(self.archive._hyperlinks[0],
                           storytracker.Hyperlink)
     self.assertIsInstance(self.archive._images[0],
                           storytracker.Image)
Example #9
0
def open_wayback_machine_url(url, **kwargs):
    """
    Accepts an URL from the Internet Archive's Wayback Machine
    and returns an ArchivedURL object

    Any keyword arguments are passed through to storytracker.archive.
    """
    # Extract the URL and timestamp from the url
    archive_url, timestamp = storytracker.reverse_wayback_machine_url(url)
    # Modify the standard Wayback Machine URL to be one that returns the raw
    # HTML without any of the chrome and navigation tools inserted by
    # the archive
    raw_suffix = "id_/%s" % archive_url
    # Look for the flag directly in front of the archived address. Testing
    # for "id_" anywhere in the URL also matches archived addresses that
    # merely contain that text (e.g. "?id_=3") and would skip the rewrite.
    if raw_suffix not in url:
        # Rewrite only the first match, the one after the timestamp
        url = url.replace("/%s" % archive_url, raw_suffix, 1)
    # Retrieve the raw HTML
    html = storytracker.archive(url, **kwargs).html
    # Pass it all back
    return ArchivedURL(archive_url, timestamp, html)
def open_wayback_machine_url(url, **kwargs):
    """
    Accepts an URL from the Internet Archive's Wayback Machine
    and returns an ArchivedURL object

    Extra keyword arguments are forwarded to storytracker.archive.
    """
    # Extract the URL and timestamp from the url
    archive_url, timestamp = storytracker.reverse_wayback_machine_url(url)
    # Modify the standard Wayback Machine URL to be one that returns the raw
    # HTML without any of the chrome and navigation tools inserted by
    # the archive
    plain_suffix = "/%s" % archive_url
    raw_suffix = "id_/%s" % archive_url
    # The flag only counts when it sits right before the archived address;
    # a bare '"id_" in url' test is fooled by an archived address that
    # itself contains "id_" and would leave the URL unmodified.
    if raw_suffix not in url:
        url = url.replace(
            plain_suffix,
            raw_suffix,
            1  # only the first occurrence, the one after the timestamp
        )
    # Retrieve the raw HTML
    html = storytracker.archive(url, **kwargs).html
    # Pass it all back
    return ArchivedURL(archive_url, timestamp, html)
Example #11
0
 def test_open_archive_html(self):
     """
     An uncompressed archive reopened from its html_archive_path should
     compare equal to the object that wrote it.
     """
     saved = storytracker.archive(
         self.url,
         output_dir=self.tmpdir,
         compress=False
     )
     html_path = saved.html_archive_path
     reopened = storytracker.open_archive_filepath(html_path)
     self.assertEqual(saved, reopened)
Example #12
0
    def test_url(self):
        """
        Smoke-test the object returned by storytracker.archive: metadata,
        browser lifecycle, gzip output, hyperlinks, CSV export and images.

        Bare attribute accesses and dunder calls are deliberate: they only
        need to run without raising.
        """
        self.archive = storytracker.archive(self.url)
        obj = self.archive

        # Metadata
        self.assertEqual(self.url, obj.url)
        obj.timestamp
        obj.html
        obj.gzip
        obj.__unicode__()
        obj.__str__()
        obj.__repr__()

        # Browser
        self.assertIsNone(self.archive._browser)
        self.archive.get_browser()
        self.assertIsInstance(self.archive._browser, webdriver.PhantomJS)
        self.archive.close_browser()
        self.assertIsNone(self.archive._browser)
        # Closing a second time must not raise
        self.archive.close_browser()

        # Gzip
        self.assertIsNone(obj.gzip_archive_path)
        obj.write_gzip_to_directory(self.tmpdir)

        # Hyperlinks
        self.assertEqual(obj._hyperlinks, [])
        self.assertIsInstance(obj.hyperlinks, list)
        self.assertEqual(obj._hyperlinks, obj.hyperlinks)
        for link in obj.hyperlinks:
            self.assertIsInstance(link, Hyperlink)
        a = obj.hyperlinks[0]
        a.href
        a.string
        a.domain
        a.index
        if a.images:
            for i in a.images:
                self.assertIsInstance(i, Image)
                i.src
                i.__unicode__()
        a.__unicode__()
        a.__str__()
        a.__repr__()
        a.__csv__()

        # Hyperlinks to CSV: once to an in-memory buffer, once to a file
        obj.write_hyperlinks_csv_to_file(six.StringIO())
        p = os.path.join(self.tmpdir, "links.csv")
        # Close the handle before checking and deleting the file so the
        # descriptor is not leaked and removal works on every platform.
        with open(p, "w+") as f2:
            obj.write_hyperlinks_csv_to_file(f2)
        self.assertTrue(os.path.exists(p))
        os.remove(p)

        # Images
        self.assertEqual(obj._images, [])
        self.assertGreater(len(obj.images), 0)
        self.assertIsInstance(obj.images, list)
        self.assertEqual(obj._images, obj.images)
        for image in obj.images:
            self.assertIsInstance(image, Image)
        img = obj.images[0]
        img.src
        img.__unicode__()
        img.__str__()
        img.__repr__()
Example #13
0
 def test_open_archive_html(self):
     """
     Write an archive with compress=False, reopen it from the HTML file
     path it reports, and expect the two objects to be equal.
     """
     archive_options = dict(output_dir=self.tmpdir, compress=False)
     written = storytracker.archive(self.url, **archive_options)
     loaded = storytracker.open_archive_filepath(written.html_archive_path)
     self.assertEqual(written, loaded)
Example #14
0
        'http://example.com',
        datetime(2014, 1, 1, 2, 0, 0),
        open("./example/c.html", "rb").read()
    ),
    storytracker.ArchivedURL(
        'http://example.com',
        datetime(2014, 1, 1, 3, 0, 0),
        open("./example/d.html", "rb").read()
    ),
])

# Analyze every page in the set
urlset.analyze()

# URL images: archive a live page, write its illustration and overlay,
# then rename both outputs to fixed filenames
obj = storytracker.archive("http://www.cnn.com/")
illo_path = obj.write_illustration_to_directory("./docs/_static/example/")
overlay_path = obj.write_overlay_to_directory("./docs/_static/example/")
os.rename(illo_path, "./docs/_static/example/illo.jpg")
os.rename(overlay_path, "./docs/_static/example/overlay.png")

# URL set images: open three Wayback Machine captures of the same page,
# in the order listed, and collect them into a set
urlset2 = storytracker.ArchivedURLSet([
    storytracker.open_wayback_machine_url(wayback_url)
    for wayback_url in (
        "https://web.archive.org/web/20140101005148/http://www.bbc.co.uk/news/",
        "https://web.archive.org/web/20140101080323/http://www.bbc.co.uk/news/",
        "https://web.archive.org/web/20140101094432/http://www.bbc.co.uk/news/",
    )
])
urlset2[0].write_overlay_to_directory("./")
gif_path = urlset2.write_href_overlay_animation_to_directory(
    "https://web.archive.org/news/world-africa-25561753",
    "./docs/_static/example/"
Example #15
0
    storytracker.ArchivedURL('http://example.com',
                             datetime(2014, 1, 1, 1, 0, 0),
                             open("./example/b.html", "rb").read()),
    storytracker.ArchivedURL('http://example.com',
                             datetime(2014, 1, 1, 2, 0, 0),
                             open("./example/c.html", "rb").read()),
    storytracker.ArchivedURL('http://example.com',
                             datetime(2014, 1, 1, 3, 0, 0),
                             open("./example/d.html", "rb").read()),
])

# Analyze every page in the set
urlset.analyze()

# URL images: archive a live page and write its illustration and overlay
obj = storytracker.archive("http://www.cnn.com/")
illo_path = obj.write_illustration_to_directory("./docs/_static/example/")
overlay_path = obj.write_overlay_to_directory("./docs/_static/example/")
# Move each generated file to its fixed name, illustration first
for generated_path, final_path in (
    (illo_path, "./docs/_static/example/illo.jpg"),
    (overlay_path, "./docs/_static/example/overlay.png"),
):
    os.rename(generated_path, final_path)

# URL images
urlset2 = storytracker.ArchivedURLSet([
    storytracker.open_wayback_machine_url(
        "https://web.archive.org/web/20140101005148/http://www.bbc.co.uk/news/"
    ),
    storytracker.open_wayback_machine_url(
        "https://web.archive.org/web/20140101080323/http://www.bbc.co.uk/news/"
    ),
    storytracker.open_wayback_machine_url(
        "https://web.archive.org/web/20140101094432/http://www.bbc.co.uk/news/"
Example #16
0
 def test_open_archive_gzip(self):
     """
     An archive written with the default settings and reopened from its
     gzip_archive_path should compare equal to the original object.
     """
     saved = storytracker.archive(self.url, output_dir=self.tmpdir)
     gzip_path = saved.gzip_archive_path
     reopened = storytracker.open_archive_filepath(gzip_path)
     self.assertEqual(saved, reopened)
Example #17
0
 def test_open_archive_gzip(self):
     """
     Round-trip check: archive to the temp directory, load the file at
     gzip_archive_path back in, and expect an equal object.
     """
     written = storytracker.archive(
         self.url,
         output_dir=self.tmpdir
     )
     loaded = storytracker.open_archive_filepath(
         written.gzip_archive_path
     )
     self.assertEqual(written, loaded)
Example #18
0
    def test_url(self):
        """
        Walk through the public surface of an archived URL: metadata,
        browser handling, gzip writing, hyperlinks, CSV export and images.

        Attribute reads and dunder calls that stand alone are intentional
        smoke checks; the test only requires that they do not raise.
        """
        self.archive = storytracker.archive(self.url)
        obj = self.archive

        # Metadata
        self.assertEqual(self.url, obj.url)
        obj.timestamp
        obj.html
        obj.gzip
        obj.__unicode__()
        obj.__str__()
        obj.__repr__()

        # Browser: absent at first, a PhantomJS driver once requested,
        # absent again after closing
        self.assertIsNone(self.archive._browser)
        self.archive.get_browser()
        self.assertIsInstance(self.archive._browser, webdriver.PhantomJS)
        self.archive.close_browser()
        self.assertIsNone(self.archive._browser)
        # A repeated close must be harmless
        self.archive.close_browser()

        # Gzip
        self.assertIsNone(obj.gzip_archive_path)
        obj.write_gzip_to_directory(self.tmpdir)

        # Hyperlinks
        self.assertEqual(obj._hyperlinks, [])
        self.assertIsInstance(obj.hyperlinks, list)
        self.assertEqual(obj._hyperlinks, obj.hyperlinks)
        for link in obj.hyperlinks:
            self.assertIsInstance(link, Hyperlink)
        a = obj.hyperlinks[0]
        a.href
        a.string
        a.domain
        a.index
        if a.images:
            for i in a.images:
                self.assertIsInstance(i, Image)
                i.src
                i.__unicode__()
        a.__unicode__()
        a.__str__()
        a.__repr__()
        a.__csv__()

        # Hyperlinks to CSV: first into memory, then into a real file
        obj.write_hyperlinks_csv_to_file(six.StringIO())
        p = os.path.join(self.tmpdir, 'links.csv')
        # The context manager closes the handle before the file is
        # checked and removed, so no open descriptor is left behind.
        with open(p, 'w+') as f2:
            obj.write_hyperlinks_csv_to_file(f2)
        self.assertTrue(os.path.exists(p))
        os.remove(p)

        # Images
        self.assertEqual(obj._images, [])
        self.assertGreater(len(obj.images), 0)
        self.assertIsInstance(obj.images, list)
        self.assertEqual(obj._images, obj.images)
        for image in obj.images:
            self.assertIsInstance(image, Image)
        img = obj.images[0]
        img.src
        img.__unicode__()
        img.__str__()
        img.__repr__()