Example 1
0
 def close_warc_after_writing(self, out):
     out.flush()
     out.seek(0)
     default_storage.store_file(out,
                                self.warc_storage_file(),
                                overwrite=True)
     out.close()
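A note on the storage call used throughout these examples: default_storage.store_file is a Perma-specific helper rather than part of Django's stock storage API. A minimal sketch of what such a helper might look like, assuming it simply wraps the standard Storage methods, is shown below (an illustration, not Perma's actual implementation):

# Hypothetical sketch of a store_file helper -- Perma's real method may differ.
from django.core.files.base import File
from django.core.files.storage import default_storage

def store_file(file_object, file_path, overwrite=False):
    # Save an already-open file object to storage and return the name actually used.
    if overwrite and default_storage.exists(file_path):
        default_storage.delete(file_path)
    return default_storage.save(file_path, File(file_object))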
Example 2
0
def compress_link_assets(*args, **kwargs):
    """
    This task creates a zipfile containing the assets of a given Perma
    link. The zip file does *not* contain mutable status data about
    the link (e.g. whether it's vested or not), only immutable asset
    metadata.  This is a Celery task so that it can be run after the
    tasks that generate the assets are finished, which we arrange by
    means of a chord. The first positional arguments of this function
    will therefore be the return values of those tasks, so we don't
    rely on them and instead retrieve everything we need via kwargs.
    """
    # fetch link and asset
    try:
        guid = kwargs['guid']
    except KeyError:
        raise TypeError("compress_link_assets() requires a guid keyword argument")
    target_link = get_object_or_404(Link, guid=guid)
    target_asset = get_object_or_404(Asset, link=target_link)

    # build metadata
    metadata = {
        "guid": guid,
        "submitted_url": target_link.submitted_url,
        "creation_timestamp": serialize_datetime(target_link.creation_timestamp),
        "submitted_title": target_link.submitted_title,
    }

    # Open a temporary file to hold our zip data.
    # Because of @run_in_tempdir we have already chdir'd to a temp dir.
    # We then use default_storage to open each file under target_asset.base_storage_path
    # (which may come from the local disk or a remote location like S3)
    # and write its contents straight into the open zip file with zipfile.writestr,
    # which takes bytes rather than a file path.
    temp_file = tempfile.TemporaryFile()
    base_storage_path_without_guid = os.path.dirname(target_asset.base_storage_path)
    with zipfile.ZipFile(temp_file, "w") as zipfh:
        for root, dirs, files in default_storage.walk(target_asset.base_storage_path):
            for file in files:
                source_file_path = os.path.join(root, file) # e.g. 2014/6/10/18/37/1234-ABCD/cap.png
                dest_file_path = source_file_path.replace(base_storage_path_without_guid+"/", '', 1) # e.g. 1234-ABCD/cap.png
                with default_storage.open(source_file_path, 'rb') as source_file:
                    zipfh.writestr(dest_file_path, source_file.read())

        # write metadata to 1234-ABCD/metadata.json
        zipfh.writestr(os.path.join(guid, "metadata.json"), json.dumps(metadata))

    # Now that the zip file has been written, we can store it to default_storage.
    temp_file.seek(0)
    zipfile_storage_path = os.path.join(settings.MEDIA_ARCHIVES_ROOT, target_asset.base_storage_path+".zip")
    default_storage.store_file(temp_file, zipfile_storage_path, overwrite=True)
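The docstring above says this task runs after the asset-generating tasks via a Celery chord, receiving their return values as ignored positional arguments. A minimal sketch of that wiring, with the capture tasks from the later examples standing in for whatever actually generates the assets (the exact call site is an assumption), might look like:

# Illustrative chord wiring -- task names and arguments are assumptions.
from celery import chord

def archive_then_compress(guid, target_url, base_storage_path, user_agent):
    # Run the capture tasks in parallel; when both finish, their return values
    # become the ignored positional args of compress_link_assets.
    chord([
        get_pdf.s(guid, target_url, base_storage_path, user_agent),
        proxy_capture.s(guid, target_url, base_storage_path, user_agent),
    ])(compress_link_assets.s(guid=guid))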
Example 3
0
def preserve_perma_warc(guid, timestamp, destination):
    """
    Context manager for opening a perma warc, ready to receive warc records.
    Safely closes and saves the file to storage when context is exited.
    """
    out = tempfile.TemporaryFile()
    write_perma_warc_header(out, guid, timestamp)
    try:
        yield out
    finally:
        out.flush()
        out.seek(0)
        default_storage.store_file(out, destination, overwrite=True)
        out.close()
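Since preserve_perma_warc yields the open file, it is presumably wrapped with contextlib.contextmanager (the decorator is not visible in these snippets). Assuming that, usage would look roughly like the following sketch, where the guid, timestamp, and destination values are invented for the example:

# Illustrative usage only -- argument values are made up.
with preserve_perma_warc("1234-ABCD", "2014-06-10T18:37:00Z", "warcs/1234-ABCD.warc.gz") as out:
    out.write(b"WARC/1.0\r\n")   # callers append warc records to the open temp file
# on exit the file is flushed, rewound, stored via default_storage, and closed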
Example 4
0
def preserve_perma_warc(guid, timestamp, destination):
    """
    Context manager for opening a perma warc, ready to receive warc records.
    Safely closes and saves the file to storage when context is exited.
    """
    out = tempfile.TemporaryFile()
    write_perma_warc_header(out, guid, timestamp)
    try:
        yield out
    finally:
        out.flush()
        out.seek(0)
        default_storage.store_file(out, destination, overwrite=True)
        out.close()
Example 5
0
def preserve_perma_warc(guid, timestamp, destination):
    """
    Context manager for opening a perma warc, ready to receive warc records.
    Safely closes and saves the file to storage when context is exited.
    """
    # mode set to 'ab+' as a workaround for https://bugs.python.org/issue25341
    out = tempfile.TemporaryFile('ab+')
    write_perma_warc_header(out, guid, timestamp)
    try:
        yield out
    finally:
        out.flush()
        out.seek(0)
        default_storage.store_file(out, destination, overwrite=True)
        out.close()
Example 6
0
def preserve_perma_warc(guid, timestamp, destination):
    """
    Context manager for opening a perma warc, ready to receive warc records.
    Safely closes and saves the file to storage when context is exited.
    """
    out = tempfile.TemporaryFile()
    write_perma_warc_header(out, guid, timestamp)
    try:
        yield out
    finally:
        out.flush()
        out.seek(0)
        default_storage.store_file(out, destination, overwrite=True)
        out.close()
Example 7
0
def get_pdf(link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call
    """

    # basic setup
    asset_query = get_asset_query(link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    pdf_request = requests.get(target_url, stream = True, verify=False,
        headers={'User-Agent': user_agent})

    # write PDF out to a temp file
    temp = tempfile.TemporaryFile()
    for chunk in pdf_request.iter_content(chunk_size=1024):

        if chunk: # filter out keep-alive new chunks
            temp.write(chunk)
            temp.flush()

        # Limit our filesize
        if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
            logger.info("PDF capture too big, %s" % target_url)
            asset_query.update(pdf_capture='failed')
            return

    # store temp file
    temp.seek(0)
    pdf_name = default_storage.store_file(temp, pdf_path)
    asset_query.update(pdf_capture=pdf_name)
Example 8
0
    def obj_create(self, bundle, **kwargs):
        # We've received a request to archive a URL. That process is managed here.
        # We create a new entry in our datastore and pass the work off to our indexing
        # workers. They do their thing, updating the model as they go. When we get some minimum
        # set of results we can present to the user (a guid for the link), we respond back.
        if settings.READ_ONLY_MODE:
            raise ImmediateHttpResponse(response=self.error_response(
                bundle.request, {
                    'archives': {
                        '__all__':
                        "Perma has paused archive creation for scheduled maintenance. Please try again shortly."
                    },
                    'reason':
                    "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
                }))

        # Runs validation (exception thrown if invalid), sets properties and saves the object
        bundle = super(LinkResource,
                       self).obj_create(bundle, created_by=bundle.request.user)
        asset = Asset(link=bundle.obj)

        uploaded_file = bundle.data.get('file')
        if uploaded_file:
            # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
            mime_type = get_mime_type(uploaded_file.name)
            file_name = 'upload.%s' % mime_type_lookup[mime_type][
                'new_extension']
            file_path = os.path.join(asset.base_storage_path, file_name)

            uploaded_file.file.seek(0)
            file_name = default_storage.store_file(uploaded_file, file_path)

            if mime_type == 'application/pdf':
                asset.pdf_capture = file_name
            else:
                asset.image_capture = file_name
            asset.user_upload = True
            asset.user_upload_file_name = uploaded_file.name
            asset.save()
        else:
            asset.image_capture = Asset.CAPTURE_STATUS_PENDING
            # If it appears as if we're trying to archive a PDF, only run our PDF retrieval tool
            if asset.link.media_type == 'pdf':
                asset.pdf_capture = Asset.CAPTURE_STATUS_PENDING
                task = get_pdf
            else:  # else, it's not a PDF. Let's try our best to retrieve what we can
                asset.warc_capture = Asset.CAPTURE_STATUS_PENDING
                task = proxy_capture

            asset.save()
            run_task(
                task.s(asset.link.guid, asset.link.submitted_url,
                       asset.base_storage_path,
                       bundle.request.META.get('HTTP_USER_AGENT', '')))

        return bundle
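The mime_type_lookup table used above is not shown in these snippets; judging from how it is indexed, it presumably maps each accepted MIME type to metadata such as the normalized extension. A plausible shape (the values are an assumption, not the real table) is:

# Assumed shape of mime_type_lookup -- the real table may carry more fields,
# e.g. a per-type validation function.
mime_type_lookup = {
    'image/jpeg':      {'new_extension': 'jpg'},
    'image/png':       {'new_extension': 'png'},
    'image/gif':       {'new_extension': 'gif'},
    'application/pdf': {'new_extension': 'pdf'},
}
# so 'upload.%s' % mime_type_lookup['application/pdf']['new_extension'] == 'upload.pdf'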
Example 9
0
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call
    """

    # basic setup
    asset = Asset.objects.get(link_id=link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    pdf_request = requests.get(target_url,
                               stream=True,
                               verify=False,
                               headers={'User-Agent': user_agent})

    # write PDF out to a temp file
    temp = tempfile.NamedTemporaryFile()
    for chunk in pdf_request.iter_content(chunk_size=1024):

        if chunk:  # filter out keep-alive new chunks
            temp.write(chunk)
            temp.flush()

        # Limit our filesize
        if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
            logger.info("PDF capture too big, %s" % target_url)
            save_fields(asset,
                        pdf_capture=Asset.CAPTURE_STATUS_FAILED,
                        image_capture=Asset.CAPTURE_STATUS_FAILED)
            return

    # store temp file
    temp.seek(0)
    pdf_name = default_storage.store_file(temp, pdf_path, overwrite=True)
    save_fields(asset, pdf_capture=pdf_name)

    # Get the first page of the PDF and create an image from it
    # Save it to disk as our image capture (likely a temporary measure)
    # The [0] in the filename gets passed through to ImageMagick and limits PDFs to the first page.
    try:
        with imagemagick_temp_dir():
            with Image(filename=temp.name + "[0]") as img:
                image_name = 'cap.png'
                image_path = os.path.join(base_storage_path, image_name)
                default_storage.store_data_to_file(img.make_blob('png'),
                                                   image_path,
                                                   overwrite=True)
                save_fields(asset, image_capture=image_name)
    except Exception as e:
        # errors with the thumbnail aren't dealbreakers -- just log here
        print "Error creating PDF thumbnail of %s: %s" % (target_url, e)
        save_fields(asset, image_capture=Asset.CAPTURE_STATUS_FAILED)
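The imagemagick_temp_dir context manager used above is not defined in these snippets. Its name and usage suggest it points ImageMagick (which Wand uses under the hood) at a scratch directory via the MAGICK_TEMPORARY_PATH environment variable and cleans that directory up on exit, so thumbnailing does not leave temp files around. A sketch along those lines (an assumption, not the actual helper) could be:

# Hypothetical sketch of imagemagick_temp_dir -- the real helper may differ.
import contextlib, os, shutil, tempfile

@contextlib.contextmanager
def imagemagick_temp_dir():
    temp_dir = tempfile.mkdtemp()
    old_value = os.environ.get('MAGICK_TEMPORARY_PATH')
    os.environ['MAGICK_TEMPORARY_PATH'] = temp_dir
    try:
        yield
    finally:
        # restore the previous value and remove the scratch directory
        if old_value is None:
            del os.environ['MAGICK_TEMPORARY_PATH']
        else:
            os.environ['MAGICK_TEMPORARY_PATH'] = old_value
        shutil.rmtree(temp_dir, ignore_errors=True)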
Example 10
0
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call
    """

    # basic setup
    asset = Asset.objects.get(link_id=link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    pdf_request = requests.get(target_url, stream = True, verify=False,
        headers={'User-Agent': user_agent})

    # write PDF out to a temp file
    temp = tempfile.NamedTemporaryFile()
    for chunk in pdf_request.iter_content(chunk_size=1024):

        if chunk: # filter out keep-alive new chunks
            temp.write(chunk)
            temp.flush()

        # Limit our filesize
        if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
            logger.info("PDF capture too big, %s" % target_url)
            save_fields(asset,
                        pdf_capture=Asset.CAPTURE_STATUS_FAILED,
                        image_capture=Asset.CAPTURE_STATUS_FAILED)
            return

    # store temp file
    temp.seek(0)
    pdf_name = default_storage.store_file(temp, pdf_path, overwrite=True)
    save_fields(asset, pdf_capture=pdf_name)
    
    # Get the first page of the PDF and create an image from it
    # Save it to disk as our image capture (likely a temporary measure)
    # The [0] in the filename gets passed through to ImageMagick and limits PDFs to the first page.
    try:
        with imagemagick_temp_dir():
            with Image(filename=temp.name+"[0]") as img:
                image_name = 'cap.png'
                image_path = os.path.join(base_storage_path, image_name)
                default_storage.store_data_to_file(img.make_blob('png'), image_path, overwrite=True)
                save_fields(asset, image_capture=image_name)
    except Exception as e:
        # errors with the thumbnail aren't dealbreakers -- just log here
        print "Error creating PDF thumbnail of %s: %s" % (target_url, e)
        save_fields(asset, image_capture=Asset.CAPTURE_STATUS_FAILED)
Example 11
0
    def obj_create(self, bundle, **kwargs):
        # We've received a request to archive a URL. That process is managed here.
        # We create a new entry in our datastore and pass the work off to our indexing
        # workers. They do their thing, updating the model as they go. When we get some minimum
        # set of results we can present to the user (a guid for the link), we respond back.
        if settings.READ_ONLY_MODE:
            raise ImmediateHttpResponse(response=self.error_response(bundle.request, {
                'archives': {'__all__': "Perma has paused archive creation for scheduled maintenance. Please try again shortly."},
                'reason': "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
            }))

        # Runs validation (exception thrown if invalid), sets properties and saves the object
        bundle = super(LinkResource, self).obj_create(bundle, created_by=bundle.request.user)
        asset = Asset(link=bundle.obj)

        uploaded_file = bundle.data.get('file')
        if uploaded_file:
            # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
            mime_type = get_mime_type(uploaded_file.name)
            file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']
            file_path = os.path.join(asset.base_storage_path, file_name)

            uploaded_file.file.seek(0)
            file_name = default_storage.store_file(uploaded_file, file_path)

            if mime_type == 'application/pdf':
                asset.pdf_capture = file_name
            else:
                asset.image_capture = file_name
            asset.user_upload = True
            asset.user_upload_file_name = uploaded_file.name
            asset.save()
        else:
            asset.image_capture = Asset.CAPTURE_STATUS_PENDING
            # If it appears as if we're trying to archive a PDF, only run our PDF retrieval tool
            if asset.link.media_type == 'pdf':
                asset.pdf_capture = Asset.CAPTURE_STATUS_PENDING
                task = get_pdf
            else:  # else, it's not a PDF. Let's try our best to retrieve what we can
                asset.warc_capture = Asset.CAPTURE_STATUS_PENDING
                task = proxy_capture

            asset.save()
            run_task(task.s(asset.link.guid,
                            asset.link.submitted_url,
                            asset.base_storage_path,
                            bundle.request.META.get('HTTP_USER_AGENT', '')))

        return bundle
Example 12
0
def get_pdf(self, link_guid, target_url, base_storage_path, user_agent):
    """
    Download a PDF from the network

    This function is executed via an asynchronous Celery call
    """

    # basic setup
    asset_query = get_asset_query(link_guid)
    pdf_name = 'cap.pdf'
    pdf_path = os.path.join(base_storage_path, pdf_name)

    # Get the PDF from the network
    pdf_request = requests.get(target_url, stream = True, verify=False,
        headers={'User-Agent': user_agent})

    # write PDF out to a temp file
    temp = tempfile.NamedTemporaryFile()
    for chunk in pdf_request.iter_content(chunk_size=1024):

        if chunk: # filter out keep-alive new chunks
            temp.write(chunk)
            temp.flush()

        # Limit our filesize
        if temp.tell() > settings.MAX_ARCHIVE_FILE_SIZE:
            logger.info("PDF capture too big, %s" % target_url)
            asset_query.update(pdf_capture='failed', image_capture='failed')
            return

    # store temp file
    temp.seek(0)
    pdf_name = default_storage.store_file(temp, pdf_path)
    asset_query.update(pdf_capture=pdf_name)
    
    # Get the first page of the PDF and create an image from it
    # Save it to disk as our image capture (likely a temporary measure)
    with Image(filename=temp.name) as img:
        image_name = 'cap.png'
        image_path = os.path.join(base_storage_path, image_name)
        default_storage.store_data_to_file(img.make_blob('png'), image_path, overwrite=True)
        asset_query.update(image_capture=image_name)
Example 13
0
def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():

            mime = MimeTypes()
            uploaded_file = request.FILES['file']
            mime_type = mime.guess_type(uploaded_file.name)

            # Get mime type string from tuple
            if mime_type[0]:
                mime_type = mime_type[0]
            else:
                return HttpResponseBadRequest(json.dumps({'status':'failed', 'reason':'Invalid file.'}), 'application/json')

            if validate_upload_file(uploaded_file, mime_type) and uploaded_file.size <= settings.MAX_ARCHIVE_FILE_SIZE:
                link = Link(submitted_url=form.cleaned_data['url'], submitted_title=form.cleaned_data['title'], created_by = request.user)
                link.save()

                asset = Asset(link=link)
                file_name = 'cap' + mime.guess_extension(mime_type)
                file_path = os.path.join(asset.base_storage_path, file_name)

                uploaded_file.file.seek(0)
                file_name = default_storage.store_file(uploaded_file, file_path)

                if mime_type == 'application/pdf':
                    asset.pdf_capture = file_name
                else:
                    asset.image_capture = file_name
                asset.save()

                response_object = {'status':'success', 'linky_id':link.guid, 'linky_hash':link.guid}

                return HttpResponse(json.dumps(response_object), 'application/json', 201)  # '201 Created' status
            else:
                return HttpResponseBadRequest(json.dumps({'status':'failed', 'reason':'Invalid file.'}), 'application/json')
        else:
            return HttpResponseBadRequest(json.dumps({'status':'failed', 'reason':'Missing file.'}), 'application/json')

    return HttpResponseBadRequest(json.dumps({'status':'failed', 'reason':'No file submitted.'}), 'application/json')
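As the comment in the view notes, MimeTypes.guess_type from the standard mimetypes module returns a (type, encoding) tuple, with None in the first slot when it cannot guess, which is why the code takes element [0] and treats a falsy value as an invalid file. A quick illustration:

from mimetypes import MimeTypes

mime = MimeTypes()
print(mime.guess_type('cap.pdf'))      # ('application/pdf', None)
print(mime.guess_type('mystery.xyz'))  # (None, None) -> rejected as 'Invalid file.'
print(mime.guess_extension('application/pdf'))  # '.pdf', hence file_name == 'cap.pdf'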
Example 14
0
def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():

            mime = MimeTypes()
            uploaded_file = request.FILES['file']
            mime_type = mime.guess_type(uploaded_file.name)

            # Get mime type string from tuple
            if mime_type[0]:
                mime_type = mime_type[0]
            else:
                return HttpResponseBadRequest(
                    json.dumps({
                        'status': 'failed',
                        'reason': 'Invalid file.'
                    }), 'application/json')

            if validate_upload_file(
                    uploaded_file, mime_type
            ) and uploaded_file.size <= settings.MAX_ARCHIVE_FILE_SIZE:
                link = Link(submitted_url=form.cleaned_data['url'],
                            submitted_title=form.cleaned_data['title'],
                            created_by=request.user)
                link.save()

                asset = Asset(link=link)
                file_name = 'cap' + mime.guess_extension(mime_type)
                file_path = os.path.join(asset.base_storage_path, file_name)

                uploaded_file.file.seek(0)
                file_name = default_storage.store_file(uploaded_file,
                                                       file_path)

                if mime_type == 'application/pdf':
                    asset.pdf_capture = file_name
                else:
                    asset.image_capture = file_name
                asset.save()

                response_object = {
                    'status': 'success',
                    'linky_id': link.guid,
                    'linky_hash': link.guid
                }

                return HttpResponse(json.dumps(response_object),
                                    'application/json',
                                    201)  # '201 Created' status
            else:
                return HttpResponseBadRequest(
                    json.dumps({
                        'status': 'failed',
                        'reason': 'Invalid file.'
                    }), 'application/json')
        else:
            return HttpResponseBadRequest(
                json.dumps({
                    'status': 'failed',
                    'reason': 'Missing file.'
                }), 'application/json')

    return HttpResponseBadRequest(
        json.dumps({
            'status': 'failed',
            'reason': 'No file submitted.'
        }), 'application/json')
Example 15
0
def proxy_capture(self,
                  link_guid,
                  target_url,
                  base_storage_path,
                  user_agent=''):
    """
    Start the warcprox process. Warcprox is a MITM proxy server and needs to be running
    before, during, and after phantomjs gets a screenshot.

    Create an image from the supplied URL, write it to disk and update our asset model with the path.
    The heavy lifting is done by PhantomJS, our headless browser.

    This whole function runs with the local dir set to a temp dir by run_in_tempdir().
    So we can use local paths for temp files, and they'll just disappear when the function exits.

    TODO: This function is probably inefficient in saving to the database after each change to asset/link.
    """
    # basic setup

    asset = Asset.objects.get(link_id=link_guid)
    link = asset.link
    image_name = 'cap.png'
    warc_name = 'archive.warc.gz'
    image_path = os.path.join(base_storage_path, image_name)
    warc_path = os.path.join(base_storage_path, warc_name)

    print "%s: Fetching %s" % (link_guid, target_url)

    # suppress verbose warcprox logs
    logging.disable(logging.INFO)

    # Set up an exception we can trigger to halt capture and release all the resources involved.
    class HaltCaptureException(Exception):
        pass

    meta_thread = browser = robots_txt_thread = warcprox_controller = warcprox_thread = favicon_thread = None
    have_warc = False

    try:

        # create a request handler class that counts unique requests and responses
        unique_requests = set()
        unique_responses = set()
        count_lock = threading.Lock()

        class CountingRequestHandler(warcprox.WarcProxyHandler):
            def _proxy_request(self):
                with count_lock:
                    unique_requests.add(self.url)
                warcprox.WarcProxyHandler._proxy_request(self)
                with count_lock:
                    unique_responses.add(self.url)

        # connect warcprox to an open port
        warcprox_port = 27500
        recorded_url_queue = warcprox.queue.Queue()
        fake_cert_authority = warcprox.CertificateAuthority()
        for i in xrange(500):
            try:
                proxy = warcprox.WarcProxy(
                    server_address=("127.0.0.1", warcprox_port),
                    ca=fake_cert_authority,
                    recorded_url_q=recorded_url_queue,
                    req_handler_class=CountingRequestHandler)
                break
            except socket_error as e:
                if e.errno != errno.EADDRINUSE:
                    raise
            warcprox_port += 1
        else:
            raise self.retry(
                exc=Exception("WarcProx couldn't find an open port."))
        proxy_address = "127.0.0.1:%s" % warcprox_port

        # set up requests getter for one-off requests outside of selenium
        parsed_target_url = urlparse.urlparse(target_url)
        target_url_base = parsed_target_url.scheme + '://' + parsed_target_url.netloc + '/'

        def proxied_get_request(url):
            return requests.get(
                url,
                headers={'User-Agent': user_agent},
                proxies={parsed_target_url.scheme: 'http://' + proxy_address},
                cert=fake_cert_authority.ca_file)

        # start warcprox in the background
        warc_writer = warcprox.WarcWriterThread(
            recorded_url_q=recorded_url_queue, gzip=True, port=warcprox_port)
        warcprox_controller = warcprox.WarcproxController(proxy, warc_writer)
        warcprox_thread = threading.Thread(
            target=warcprox_controller.run_until_shutdown,
            name="warcprox",
            args=())
        warcprox_thread.start()

        # print "WarcProx opened."

        # fetch robots.txt in the background
        def robots_txt_thread():
            #print "Fetching robots.txt ..."
            robots_txt_location = target_url_base + 'robots.txt'
            try:
                robots_txt_response = proxied_get_request(robots_txt_location)
                assert robots_txt_response.ok
            except (requests.ConnectionError, requests.Timeout,
                    AssertionError):
                #print "Couldn't reach robots.txt"
                return

            # We only want to respect robots.txt if Perma is specifically asked not to archive (we're not a crawler)
            if 'Perma' in robots_txt_response.content:
                # We found Perma specifically mentioned
                rp = robotparser.RobotFileParser()
                rp.parse([
                    line.strip()
                    for line in robots_txt_response.content.split('\n')
                ])
                if not rp.can_fetch('Perma', target_url):
                    save_fields(link, dark_archived_robots_txt_blocked=True)
            # print "Robots.txt fetched."

        robots_txt_thread = threading.Thread(target=robots_txt_thread,
                                             name="robots")
        robots_txt_thread.start()

        # fetch page in the background
        # print "Fetching url."
        browser = get_browser(user_agent, proxy_address,
                              fake_cert_authority.ca_file)
        browser.set_window_size(1024, 800)
        page_load_thread = threading.Thread(
            target=browser.get, args=(target_url, ))  # returns after onload
        page_load_thread.start()
        page_load_thread.join(PAGE_LOAD_TIMEOUT)
        if page_load_thread.is_alive():
            # print "Waited 60 seconds for onLoad event -- giving up."
            if not unique_responses:
                # if nothing at all has loaded yet, give up on the capture
                save_fields(asset,
                            warc_capture='failed',
                            image_capture='failed')
                raise HaltCaptureException
        # print "Finished fetching url."

        # get favicon
        favicons = browser.find_elements_by_xpath(
            '//link[@rel="icon" or @rel="shortcut icon"]')
        favicons = [i for i in favicons if i.get_attribute('href')]
        if favicons:
            favicon_url = urlparse.urljoin(browser.current_url,
                                           favicons[0].get_attribute('href'))
            favicon_extension = favicon_url.rsplit('.', 1)[-1]
            if not favicon_extension in ['ico', 'gif', 'jpg', 'jpeg', 'png']:
                favicon_url = None
        else:
            favicon_url = urlparse.urljoin(browser.current_url, '/favicon.ico')
        if favicon_url:
            # try to fetch favicon in background
            def favicon_thread():
                # print "Fetching favicon from %s ..." % favicon_url
                try:
                    favicon_response = proxied_get_request(favicon_url)
                    assert favicon_response.ok
                except (requests.ConnectionError, requests.Timeout,
                        AssertionError):
                    # print "Couldn't get favicon"
                    return
                favicon_file = favicon_url.rsplit('/', 1)[-1]
                default_storage.store_data_to_file(favicon_response.content,
                                                   os.path.join(
                                                       base_storage_path,
                                                       favicon_file),
                                                   overwrite=True)
                save_fields(asset, favicon=favicon_file)
                print "Saved favicon as %s" % favicon_file

            favicon_thread = threading.Thread(target=favicon_thread,
                                              name="favicon")
            favicon_thread.start()

        # get page title
        # print "Getting title."
        if browser.title:
            save_fields(link, submitted_title=browser.title)

        # check meta tags
        # (run this in a thread and give it long enough to find the tags, but then let other stuff proceed)
        # print "Checking meta tags."
        def meta_thread():
            # get all meta tags
            meta_tags = browser.find_elements_by_tag_name('meta')
            # first look for <meta name='perma'>
            meta_tag = next((tag for tag in meta_tags
                             if tag.get_attribute('name').lower() == 'perma'),
                            None)
            # else look for <meta name='robots'>
            if not meta_tag:
                meta_tag = next(
                    (tag for tag in meta_tags
                     if tag.get_attribute('name').lower() == 'robots'), None)
            # if we found a relevant meta tag, check for noarchive
            if meta_tag and 'noarchive' in meta_tag.get_attribute(
                    "content").lower():
                save_fields(link, dark_archived_robots_txt_blocked=True)
                # print "Meta found, darchiving"

        meta_thread = threading.Thread(target=meta_thread)
        meta_thread.start()
        meta_thread.join(ELEMENT_DISCOVERY_TIMEOUT * 2)

        # scroll to bottom of page and back up, in case that prompts anything else to load
        try:
            browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            browser.execute_script("window.scrollTo(0, 0);")
        except WebDriverException:
            pass

        # get page size to decide whether to take a screenshot
        capture_screenshot = False
        try:
            root_element = browser.find_element_by_tag_name('body')
        except NoSuchElementException:
            try:
                root_element = browser.find_element_by_tag_name('frameset')
            except NoSuchElementException:
                root_element = None
        if root_element:
            page_size = root_element.size
            pixel_count = page_size['width'] * page_size['height']
            capture_screenshot = pixel_count < settings.MAX_IMAGE_SIZE
        if not capture_screenshot:
            # print "Not saving screenshots! Page size is %s pixels." % pixel_count
            save_fields(asset, image_capture='failed')

        # save preliminary screenshot immediately, and an updated version later
        # (we want to return results quickly, but also give javascript time to render final results)
        if capture_screenshot:
            # print "Saving first screenshot."
            save_screenshot(browser, image_path)
            save_fields(asset, image_capture=image_name)

        # make sure all requests are finished
        # print "Waiting for post-load requests."
        start_time = time.time()
        time.sleep(min(AFTER_LOAD_TIMEOUT, 5))
        while len(unique_responses) < len(unique_requests):
            # print "%s/%s finished" % (len(unique_responses), len(unique_requests))
            if time.time() - start_time > AFTER_LOAD_TIMEOUT:
                # print "Waited %s seconds to finish post-load requests -- giving up." % AFTER_LOAD_TIMEOUT
                break
            time.sleep(.5)

        # take second screenshot after all requests done
        if capture_screenshot:
            # print "Taking second screenshot."
            save_screenshot(browser, image_path)

        have_warc = True

    except HaltCaptureException:
        pass

    finally:
        # teardown (have to do this before save to make sure WARC is done writing):
        # print "Shutting down browser and proxies."

        if browser:
            browser.quit()  # shut down phantomjs

            # This can be removed when this bugfix ships in selenium:
            # https://code.google.com/p/selenium/issues/detail?id=8498
            browser.service.process.stdin.close()
        if meta_thread:
            meta_thread.join()  # wait until meta thread is done
        if robots_txt_thread:
            robots_txt_thread.join()  # wait until robots thread is done
        if favicon_thread:
            favicon_thread.join()  # wait until favicon thread is done
        if warcprox_controller:
            warcprox_controller.stop.set(
            )  # send signal to shut down warc thread
        if warcprox_thread:
            warcprox_thread.join(
            )  # wait until warcprox thread is done writing out warc

        # un-suppress logging
        logging.disable(logging.NOTSET)

    # save generated warc file
    if have_warc:
        # print "Saving WARC."
        try:
            temp_warc_path = os.path.join(warc_writer.directory,
                                          warc_writer._f_finalname)
            with open(temp_warc_path, 'rb') as warc_file:
                warc_name = default_storage.store_file(warc_file, warc_path)
                save_fields(asset, warc_capture=warc_name)

            # print "Writing CDX lines to the DB"
            CDXLine.objects.create_all_from_asset(asset)

        except Exception as e:
            logger.info("Web Archive File creation failed for %s: %s" %
                        (target_url, e))
            save_fields(asset, warc_capture='failed')
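One detail worth calling out in proxy_capture is the for/else used to find a free port for warcprox: the else branch runs only if the loop finished all 500 iterations without hitting break, i.e. every candidate port was already in use, and only then is the task retried. A stripped-down illustration of the same pattern, using plain sockets instead of warcprox, is:

# Minimal illustration of the for/else port-hunting pattern above.
import errno
import socket

port = 27500
for _ in range(500):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(("127.0.0.1", port))
        break                      # bound successfully: the else clause is skipped
    except socket.error as e:
        sock.close()
        if e.errno != errno.EADDRINUSE:
            raise                  # unexpected error: propagate
    port += 1                      # address in use: try the next port
else:
    raise Exception("couldn't find an open port in 27500-27999")
# at this point sock is bound to port and ready to use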
Example 16
0
 def close_warc_after_writing(self, out):
     out.flush()
     out.seek(0)
     default_storage.store_file(out, self.warc_storage_file(), overwrite=True)
     out.close()
Example 17
0
 def safe_delete_warc(self):
     old_name = self.warc_storage_file()
     if default_storage.exists(old_name):
         new_name = old_name.replace('.warc.gz', '_replaced.warc.gz')
         default_storage.store_file(default_storage.open(old_name), new_name)
         default_storage.delete(old_name)
Example 18
0
def update_perma(link_guid):
    """
    Update the vested/darchived status of a perma link, and download the
    assets if necessary
    """
    # N.B. This function has two instances of downloading stuff from
    # the root server using a scheme that looks something like
    #    settings.SERVER + reverse("url_pattern")
    # This is nice because it means we don't have to repeat our URL
    # patterns from urls.py, but it hardcodes the fact that the root
    # server is another Perma instance. It's unclear to me which is a
    # better fact to abstract, but this is easier for now.

    ## First, let's get the metadata for this link. The metadata
    ## contains information about where we should place the assets (if
    ## we decide that we need them). This is also a fast check to make
    ## sure the link GUID is actually real.
    metadata_server = settings.UPSTREAM_SERVER['address']
    metadata_url = metadata_server + reverse("service_link_status", args=(link_guid,))
    metadata = requests.get(
        metadata_url,
        headers=settings.UPSTREAM_SERVER.get('headers', {})
    ).json()

    ## Next, let's see if we need to get the assets. If we have the
    ## Link object for this GUID, we're going to assume we already
    ## have what we need. It would make a little more sense to use the
    ## Asset object here instead, but we're definitely going to need
    ## to do stuff to the Link object so we might as well get that
    ## instead. In practice they should be ~one to one.
    try:
        link = Link.objects.get(guid=link_guid)
    except Link.DoesNotExist:
        ## We need to download the assets. We can download an archive
        ## from the assets server.
        assets_server = settings.UPSTREAM_SERVER['address']
        assets_url = assets_server + reverse("mirroring:link_assets", args=(link_guid,))

        # Temp paths can be relative because we're in run_in_tempdir()
        temp_zip_path = 'temp.zip'

        # Save remote zip file to disk, using streaming to avoid keeping large files in RAM.
        request = requests.get(
            assets_url,
            headers=settings.UPSTREAM_SERVER.get('headers', {}),
            stream=True)
        with open(temp_zip_path, 'wb') as f:
            for chunk in request.iter_content(1024):
                f.write(chunk)

        ## Extract the archive and change into the extracted folder.
        with zipfile.ZipFile(temp_zip_path, "r") as zipfh:
            #assets_path = os.path.dirname(os.path.join(settings.MEDIA_ROOT, metadata["path"]))
            zipfh.extractall() # creates folder named [guid] in current temp dir
        temp_extracted_path = os.path.basename(metadata['path']) # e.g. "1234-ABCD"

        # Save all extracted files to default_storage, using the path in metadata.
        for root, dirs, files in os.walk(temp_extracted_path):
            for file in files:
                source_file_path = os.path.join(root, file) # e.g. "1234-ABCD/cap.png"
                dest_file_path = os.path.join(os.path.dirname(metadata['path']), source_file_path) # e.g. 2014/6/10/18/37/1234-ABCD/cap.png
                with open(source_file_path, 'rb') as source_file:
                    default_storage.store_file(source_file, dest_file_path)

        ## We can now get some additional metadata that we'll need to
        ## create the Link object.
        with open(os.path.join(temp_extracted_path, "metadata.json"), "r") as fh:
            link_metadata = json.load(fh)

        ## We now have everything we need to initialize the Link object.
        link = Link(guid=link_guid)
        link.submitted_url = link_metadata["submitted_url"]
        link.submitted_title = link_metadata["submitted_title"]
        link.created_by = None # XXX maybe we should do something with FakeUser here
        link.save(pregenerated_guid=True) # We need to save this so that we can create an Asset object

        # This is a stupid hack to overcome the fact that the Link has
        # auto_now_add=True, so it's always going to be saved to the
        # current time on first creation.
        link.creation_timestamp = unserialize_datetime(link_metadata["creation_timestamp"])
        link.save()

        ## Lastly, let's create an Asset object for this Link.
        asset = Asset(link=link)
        asset.base_storage_path = metadata["path"]
        asset.image_capture = metadata["image_capture"]
        asset.warc_capture = metadata["source_capture"]
        asset.pdf_capture = metadata["pdf_capture"]
        asset.text_capture = metadata["text_capture"]
        asset.save()

    ## We can now add some of the data we got from the metadata to the Link object
    link.dark_archived = metadata["dark_archived"]
    link.vested = metadata["vested"]
    link.save()

    # If we have sub-mirrors, poke them to get a copy from us.
    if settings.DOWNSTREAM_SERVERS:
        run_task(poke_mirrors, link_guid=link_guid)
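Examples 2 and 18 round-trip Link.creation_timestamp through serialize_datetime and unserialize_datetime, which are not shown in these snippets. Assuming they are thin ISO-8601 wrappers, they might look roughly like this (an assumption, not Perma's actual helpers):

# Assumed implementations of the datetime (de)serialization helpers.
from dateutil import parser as date_parser   # python-dateutil

def serialize_datetime(dt):
    # e.g. datetime(2014, 6, 10, 18, 37) -> '2014-06-10T18:37:00'
    return dt.isoformat()

def unserialize_datetime(s):
    # e.g. '2014-06-10T18:37:00' -> datetime(2014, 6, 10, 18, 37)
    return date_parser.parse(s)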
Example 19
0
def proxy_capture(self, link_guid, target_url, base_storage_path, user_agent=''):
    """
    Start the warcprox process. Warcprox is a MITM proxy server and needs to be running
    before, during, and after phantomjs gets a screenshot.

    Create an image from the supplied URL, write it to disk and update our asset model with the path.
    The heavy lifting is done by PhantomJS, our headless browser.

    This whole function runs with the local dir set to a temp dir by run_in_tempdir().
    So we can use local paths for temp files, and they'll just disappear when the function exits.

    TODO: This function is probably inefficient in saving to the database after each change to asset/link.
    """
    # basic setup

    asset = Asset.objects.get(link_id=link_guid)
    link = asset.link
    image_name = 'cap.png'
    warc_name = 'archive.warc.gz'
    image_path = os.path.join(base_storage_path, image_name)
    warc_path = os.path.join(base_storage_path, warc_name)

    print "%s: Fetching %s" % (link_guid, target_url)

    # suppress verbose warcprox logs
    logging.disable(logging.INFO)

    # Set up an exception we can trigger to halt capture and release all the resources involved.
    class HaltCaptureException(Exception):
        pass
    meta_thread = browser = robots_txt_thread = warcprox_controller = warcprox_thread = favicon_thread = None
    have_warc = False

    try:

        # create a request handler class that counts unique requests and responses
        unique_requests = set()
        unique_responses = set()
        count_lock = threading.Lock()
        class CountingRequestHandler(warcprox.WarcProxyHandler):
            def _proxy_request(self):
                with count_lock:
                    unique_requests.add(self.url)
                warcprox.WarcProxyHandler._proxy_request(self)
                with count_lock:
                    unique_responses.add(self.url)

        # connect warcprox to an open port
        warcprox_port = 27500
        recorded_url_queue = warcprox.queue.Queue()
        fake_cert_authority = warcprox.CertificateAuthority()
        for i in xrange(500):
            try:
                proxy = warcprox.WarcProxy(
                    server_address=("127.0.0.1", warcprox_port),
                    ca=fake_cert_authority,
                    recorded_url_q=recorded_url_queue,
                    req_handler_class=CountingRequestHandler
                )
                break
            except socket_error as e:
                if e.errno != errno.EADDRINUSE:
                    raise
            warcprox_port += 1
        else:
            raise self.retry(exc=Exception("WarcProx couldn't find an open port."))
        proxy_address = "127.0.0.1:%s" % warcprox_port

        # set up requests getter for one-off requests outside of selenium
        parsed_target_url = urlparse.urlparse(target_url)
        target_url_base = parsed_target_url.scheme + '://' + parsed_target_url.netloc + '/'
        def proxied_get_request(url):
            return requests.get(url,
                                headers={'User-Agent': user_agent},
                                proxies={parsed_target_url.scheme: 'http://' + proxy_address},
                                cert=fake_cert_authority.ca_file)

        # start warcprox in the background
        warc_writer = warcprox.WarcWriterThread(recorded_url_q=recorded_url_queue, gzip=True, port=warcprox_port)
        warcprox_controller = warcprox.WarcproxController(proxy, warc_writer)
        warcprox_thread = threading.Thread(target=warcprox_controller.run_until_shutdown, name="warcprox", args=())
        warcprox_thread.start()

        # print "WarcProx opened."

        # fetch robots.txt in the background
        def robots_txt_thread():
            #print "Fetching robots.txt ..."
            robots_txt_location = target_url_base + 'robots.txt'
            try:
                robots_txt_response = proxied_get_request(robots_txt_location)
                assert robots_txt_response.ok
            except (requests.ConnectionError, requests.Timeout, AssertionError):
                #print "Couldn't reach robots.txt"
                return

            # We only want to respect robots.txt if Perma is specifically asked not to archive (we're not a crawler)
            if 'Perma' in robots_txt_response.content:
                # We found Perma specifically mentioned
                rp = robotparser.RobotFileParser()
                rp.parse([line.strip() for line in robots_txt_response.content.split('\n')])
                if not rp.can_fetch('Perma', target_url):
                    save_fields(link, dark_archived_robots_txt_blocked=True)
            # print "Robots.txt fetched."
        robots_txt_thread = threading.Thread(target=robots_txt_thread, name="robots")
        robots_txt_thread.start()

        # fetch page in the background
        # print "Fetching url."
        browser = get_browser(user_agent, proxy_address, fake_cert_authority.ca_file)
        browser.set_window_size(1024, 800)
        page_load_thread = threading.Thread(target=browser.get, args=(target_url,))  # returns after onload
        page_load_thread.start()
        page_load_thread.join(PAGE_LOAD_TIMEOUT)
        if page_load_thread.is_alive():
            # print "Waited 60 seconds for onLoad event -- giving up."
            if not unique_responses:
                # if nothing at all has loaded yet, give up on the capture
                save_fields(asset, warc_capture='failed', image_capture='failed')
                raise HaltCaptureException
        # print "Finished fetching url."

        # get favicon
        favicons = browser.find_elements_by_xpath('//link[@rel="icon" or @rel="shortcut icon"]')
        favicons = [i for i in favicons if i.get_attribute('href')]
        if favicons:
            favicon_url = urlparse.urljoin(browser.current_url, favicons[0].get_attribute('href'))
            favicon_extension = favicon_url.rsplit('.',1)[-1]
            if not favicon_extension in ['ico', 'gif', 'jpg', 'jpeg', 'png']:
                favicon_url = None
        else:
            favicon_url = urlparse.urljoin(browser.current_url, '/favicon.ico')
        if favicon_url:
            # try to fetch favicon in background
            def favicon_thread():
                # print "Fetching favicon from %s ..." % favicon_url
                try:
                    favicon_response = proxied_get_request(favicon_url)
                    assert favicon_response.ok
                except (requests.ConnectionError, requests.Timeout, AssertionError):
                    # print "Couldn't get favicon"
                    return
                favicon_file = favicon_url.rsplit('/',1)[-1]
                default_storage.store_data_to_file(favicon_response.content,
                                                   os.path.join(base_storage_path, favicon_file),
                                                   overwrite=True)
                save_fields(asset, favicon=favicon_file)
                print "Saved favicon as %s" % favicon_file
            favicon_thread = threading.Thread(target=favicon_thread, name="favicon")
            favicon_thread.start()

        # get page title
        # print "Getting title."
        if browser.title:
            save_fields(link, submitted_title=browser.title)

        # check meta tags
        # (run this in a thread and give it long enough to find the tags, but then let other stuff proceed)
        # print "Checking meta tags."
        def meta_thread():
            # get all meta tags
            meta_tags = browser.find_elements_by_tag_name('meta')
            # first look for <meta name='perma'>
            meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower()=='perma'), None)
            # else look for <meta name='robots'>
            if not meta_tag:
                meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower() == 'robots'), None)
            # if we found a relevant meta tag, check for noarchive
            if meta_tag and 'noarchive' in meta_tag.get_attribute("content").lower():
                save_fields(link, dark_archived_robots_txt_blocked=True)
                # print "Meta found, darchiving"

        meta_thread = threading.Thread(target=meta_thread)
        meta_thread.start()
        meta_thread.join(ELEMENT_DISCOVERY_TIMEOUT*2)

        # scroll to bottom of page and back up, in case that prompts anything else to load
        try:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            browser.execute_script("window.scrollTo(0, 0);")
        except WebDriverException:
            pass

        # get page size to decide whether to take a screenshot
        capture_screenshot = False
        try:
            root_element = browser.find_element_by_tag_name('body')
        except NoSuchElementException:
            try:
                root_element = browser.find_element_by_tag_name('frameset')
            except NoSuchElementException:
                root_element = None
        if root_element:
            page_size = root_element.size
            pixel_count = page_size['width']*page_size['height']
            capture_screenshot = pixel_count < settings.MAX_IMAGE_SIZE
        if not capture_screenshot:
            # print "Not saving screenshots! Page size is %s pixels." % pixel_count
            save_fields(asset, image_capture='failed')

        # save preliminary screenshot immediately, and an updated version later
        # (we want to return results quickly, but also give javascript time to render final results)
        if capture_screenshot:
            # print "Saving first screenshot."
            save_screenshot(browser, image_path)
            save_fields(asset, image_capture=image_name)

        # make sure all requests are finished
        # print "Waiting for post-load requests."
        start_time = time.time()
        time.sleep(min(AFTER_LOAD_TIMEOUT, 5))
        while len(unique_responses) < len(unique_requests):
            # print "%s/%s finished" % (len(unique_responses), len(unique_requests))
            if time.time() - start_time > AFTER_LOAD_TIMEOUT:
                # print "Waited %s seconds to finish post-load requests -- giving up." % AFTER_LOAD_TIMEOUT
                break
            time.sleep(.5)

        # take second screenshot after all requests done
        if capture_screenshot:
            # print "Taking second screenshot."
            save_screenshot(browser, image_path)

        have_warc = True

    except HaltCaptureException:
        pass

    finally:
        # teardown (have to do this before save to make sure WARC is done writing):
        # print "Shutting down browser and proxies."

        if browser:
            browser.quit()  # shut down phantomjs

            # This can be removed when this bugfix ships in selenium:
            # https://code.google.com/p/selenium/issues/detail?id=8498
            browser.service.process.stdin.close()
        if meta_thread:
            meta_thread.join()  # wait until meta thread is done
        if robots_txt_thread:
            robots_txt_thread.join()  # wait until robots thread is done
        if favicon_thread:
            favicon_thread.join()  # wait until favicon thread is done
        if warcprox_controller:
            warcprox_controller.stop.set()  # send signal to shut down warc thread
        if warcprox_thread:
            warcprox_thread.join()  # wait until warcprox thread is done writing out warc

        # un-suppress logging
        logging.disable(logging.NOTSET)

    # save generated warc file
    if have_warc:
        # print "Saving WARC."
        try:
            temp_warc_path = os.path.join(warc_writer.directory,
                                          warc_writer._f_finalname)
            with open(temp_warc_path, 'rb') as warc_file:
                warc_name = default_storage.store_file(warc_file, warc_path)
                save_fields(asset, warc_capture=warc_name)

            # print "Writing CDX lines to the DB"
            CDXLine.objects.create_all_from_asset(asset)

        except Exception as e:
            logger.info("Web Archive File creation failed for %s: %s" % (target_url, e))
            save_fields(asset, warc_capture='failed')
Example 20
0
def proxy_capture(self, link_guid, target_url, base_storage_path, user_agent=''):
    """
    Start the warcprox process. Warcprox is a MITM proxy server and needs to be running
    before, during, and after phantomjs gets a screenshot.

    Create an image from the supplied URL, write it to disk and update our asset model with the path.
    The heavy lifting is done by PhantomJS, our headless browser.

    This whole function runs with the local dir set to a temp dir by run_in_tempdir().
    So we can use local paths for temp files, and they'll just disappear when the function exits.
    """
    # basic setup

    asset_query = get_asset_query(link_guid)
    link_query = get_link_query(link_guid)
    image_name = 'cap.png'
    warc_name = 'archive.warc.gz'
    image_path = os.path.join(base_storage_path, image_name)
    warc_path = os.path.join(base_storage_path, warc_name)

    print "%s: Fetching %s" % (link_guid, target_url)

    # create a request handler class that counts unique requests and responses
    #global unique_requests, unique_responses
    unique_requests = set()
    unique_responses = set()
    count_lock = threading.Lock()
    class CountingRequestHandler(warcprox.WarcProxyHandler):
        def _proxy_request(self):
            #global unique_requests, unique_responses
            with count_lock:
                unique_requests.add(self.url)
            warcprox.WarcProxyHandler._proxy_request(self)
            with count_lock:
                unique_responses.add(self.url)

    # connect warcprox to an open port
    warcprox_port = 27500
    recorded_url_queue = warcprox.queue.Queue()
    fake_cert_authority = warcprox.CertificateAuthority()
    for i in xrange(500):
        try:
            proxy = warcprox.WarcProxy(
                server_address=("127.0.0.1", warcprox_port),
                ca=fake_cert_authority,
                recorded_url_q=recorded_url_queue,
                req_handler_class=CountingRequestHandler
            )
            break
        except socket_error as e:
            if e.errno != errno.EADDRINUSE:
                raise
        warcprox_port += 1
    else:
        raise self.retry(exc=Exception("WarcProx couldn't find an open port."))
    proxy_address = "127.0.0.1:%s" % warcprox_port

    # start warcprox in the background
    warc_writer = warcprox.WarcWriterThread(recorded_url_q=recorded_url_queue, gzip=True, port=warcprox_port)
    warcprox_controller = warcprox.WarcproxController(proxy, warc_writer)
    warcprox_thread = threading.Thread(target=warcprox_controller.run_until_shutdown, name="warcprox", args=())
    warcprox_thread.start()

    print "WarcProx opened."

    # fetch robots.txt in the background
    def robots_txt_thread():
        print "Fetching robots.txt ..."
        parsed_url = urlparse.urlparse(target_url)
        robots_txt_location = parsed_url.scheme + '://' + parsed_url.netloc + '/robots.txt'
        try:
            robots_txt_response = requests.get(robots_txt_location,
                                               headers={'User-Agent': user_agent},
                                               proxies={parsed_url.scheme:'http://'+proxy_address},
                                               cert=fake_cert_authority.ca_file)
        except (requests.ConnectionError, requests.Timeout):
            print "Couldn't reach robots.txt"
            return
        if not robots_txt_response.ok:
            print "No robots.txt found"
            return

        # We only want to respect robots.txt if Perma is specifically asked not to archive (we're not a crawler)
        if 'Perma' in robots_txt_response.content:
            # We found Perma specifically mentioned
            rp = robotparser.RobotFileParser()
            rp.parse([line.strip() for line in robots_txt_response.content.split('\n')])
            if not rp.can_fetch('Perma', target_url):
                link_query.update(dark_archived_robots_txt_blocked=True)
        print "Robots.txt fetched."
    robots_txt_thread = threading.Thread(target=robots_txt_thread, name="robots")
    robots_txt_thread.start()

    # fetch page in the background
    # (we'll give the page PAGE_LOAD_TIMEOUT seconds to load before moving on)
    print "Fetching url."
    browser = get_browser(user_agent, proxy_address, fake_cert_authority.ca_file)
    browser.set_window_size(1024, 800)
    page_load_thread = threading.Thread(target=browser.get, args=(target_url,))  # returns after onload
    page_load_thread.start()
    page_load_thread.join(PAGE_LOAD_TIMEOUT)
    if page_load_thread.is_alive():
        print "Waited 60 seconds for onLoad event -- giving up."
        if not unique_responses:
            # if nothing at all has loaded yet, give up on the capture
            asset_query.update(warc_capture='failed', image_capture='failed')
            browser.quit()  # shut down phantomjs
            robots_txt_thread.join()  # wait until robots thread is done
            warcprox_controller.stop.set()  # send signal to shut down warc thread
            warcprox_thread.join()
            return
    print "Finished fetching url."

    # get page title
    print "Getting title."
    if browser.title:
        link_query.update(submitted_title=browser.title)

    # check meta tags
    # (run this in a thread and give it long enough to find the tags, but then let other stuff proceed)
    print "Checking meta tags."
    def meta_thread():
        # get all meta tags
        meta_tags = browser.find_elements_by_tag_name('meta')
        # first look for <meta name='perma'>
        meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower()=='perma'), None)
        # else look for <meta name='robots'>
        if not meta_tag:
            meta_tag = next((tag for tag in meta_tags if tag.get_attribute('name').lower() == 'robots'), None)
        # if we found a relevant meta tag, check for noarchive
        if meta_tag and 'noarchive' in meta_tag.get_attribute("content").lower():
            link_query.update(dark_archived_robots_txt_blocked=True)
            print "Meta found, darchiving"
        else:
            print "Meta not found."
    meta_thread = threading.Thread(target=meta_thread)
    meta_thread.start()
    meta_thread.join(ELEMENT_DISCOVERY_TIMEOUT*2)

    # save preliminary screenshot immediately, and an updated version later
    # (we want to return results quickly, but also give javascript time to render final results)
    print "Saving first screenshot."
    save_screenshot(browser, image_path)
    asset_query.update(image_capture=image_name)

    # make sure all requests are finished
    print "Waiting for post-load requests."
    start_time = time.time()
    time.sleep(min(AFTER_LOAD_TIMEOUT, 5))
    while len(unique_responses) < len(unique_requests):
        print "%s/%s finished" % (len(unique_responses), len(unique_requests))
        if time.time() - start_time > AFTER_LOAD_TIMEOUT:
            print "Waited %s seconds to finish post-load requests -- giving up." % AFTER_LOAD_TIMEOUT
            break
        time.sleep(.5)

    # take second screenshot after all requests done
    print "Taking second screenshot."
    save_screenshot(browser, image_path)

    # teardown:
    print "Shutting down browser and proxies."
    browser.quit()  # shut down phantomjs
    robots_txt_thread.join()  # wait until robots thread is done
    meta_thread.join()  # wait until meta thread is done
    warcprox_controller.stop.set()  # send signal to shut down warc thread
    warcprox_thread.join()  # wait until warcprox thread is done writing out warc

    print "Saving WARC."

    # save generated warc file
    try:
        temp_warc_path = os.path.join(warc_writer.directory, warc_writer._f_finalname)
        with open(temp_warc_path, 'rb') as warc_file:
            warc_name = default_storage.store_file(warc_file, warc_path)
            asset_query.update(warc_capture=warc_name)
    except Exception as e:
        logger.info("Web Archive File creation failed for %s: %s" % (target_url, e))
        asset_query.update(warc_capture='failed')

    print "%s capture done." % link_guid