Example #1
0
def archive(
    url, verify=True, minify=True, extend_urls=True, compress=True,
    output_dir=None
        ):
    """
    Archive the HTML from the provided URL.

    Arguments:
        url: The URL to archive.
        verify: Verify that the URL responds with HTML before archiving
            (storytracker.get raises otherwise).
        minify: Minify the retrieved HTML with htmlmin.
        extend_urls: Rewrite relative hyperlinks in the page as
            absolute URLs.
        compress: When ``output_dir`` is given, write a gzipped archive
            instead of raw HTML.
        output_dir: Optional directory to also write the archive to.

    Returns:
        A storytracker.ArchivedURL object.
    """
    # Lazy %-args: the message is only interpolated when DEBUG is enabled
    logger.debug("Archiving URL: %s", url)

    # Timestamp the archive with a timezone-aware UTC datetime
    now = datetime.utcnow().replace(tzinfo=pytz.utc)

    # Get the html
    html = storytracker.get(url, verify=verify)

    # Minify the html (but option to skip)
    if minify:
        html = htmlmin.minify(html)

    # Replace all relative URLs with absolute URLs, if called for
    if extend_urls:
        # Name the parser explicitly so results do not vary with which
        # optional parsers (lxml, html5lib) happen to be installed, and
        # to silence BeautifulSoup's "no parser specified" warning.
        soup = BeautifulSoup(html, "html.parser")
        for target in COMMON_HYPERLINK_LOCATIONS:
            for hit in soup.findAll(*target['tag']):
                hit[target['attr']] = urljoin(url, hit[target['attr']])
        html = six.text_type(soup)

    # Create an URLArchive object
    obj = storytracker.ArchivedURL(url, now, html)

    # If a custom output dir is provided put everything in there
    if output_dir:
        logger.debug("Writing file to %s", output_dir)
        if compress:
            obj.write_gzip_to_directory(output_dir)
        else:
            obj.write_html_to_directory(output_dir)

    # Return ArchivedURL object
    return obj
Example #2
0
 def test_get(self):
     """
     Test that storytracker.get() fetches an HTML URL and rejects
     non-HTML content unless verification is disabled.
     """
     # self.url (presumably an HTML page — set up elsewhere) should
     # download without raising
     storytracker.get(self.url)
     # Non-HTML content (self.img) must be rejected with ValueError
     # while verification is on
     with self.assertRaises(ValueError):
         storytracker.get(self.img)
     # ...but is accepted when verification is explicitly disabled
     storytracker.get(self.img, verify=False)
Example #3
0
 def test_get(self):
     """
     Test that storytracker.get() fetches an HTML URL and rejects
     non-HTML content unless verification is disabled.
     """
     # self.url (presumably an HTML page — set up elsewhere) should
     # download without raising
     storytracker.get(self.url)
     # Non-HTML content (self.img) must be rejected with ValueError
     # while verification is on
     with self.assertRaises(ValueError):
         storytracker.get(self.img)
     # ...but is accepted when verification is explicitly disabled
     storytracker.get(self.img, verify=False)