Example #1
    def obj_create(self, bundle, **kwargs):
        # We've received a request to archive a URL. That process is managed here.
        # We create a new entry in our datastore and pass the work off to our indexing
        # workers. They do their thing, updating the model as they go. When we get some minimum
        # set of results we can present the user (a guid for the link), we respond back.
        if settings.READ_ONLY_MODE:
            raise ImmediateHttpResponse(response=self.error_response(
                bundle.request, {
                    'archives': {
                        '__all__':
                        "Perma has paused archive creation for scheduled maintenance. Please try again shortly."
                    },
                    'reason':
                    "Perma has paused archive creation for scheduled maintenance. Please try again shortly.",
                }))

        # Runs validation (exception thrown if invalid), sets properties and saves the object
        if not bundle.data.get('replace'):
            bundle = super(LinkResource, self).obj_create(
                bundle, created_by=bundle.request.user)

        link = bundle.obj
        link.save()

        # put link in folder and handle Org settings based on folder
        folder = bundle.data.get('folder')
        if folder.organization and folder.organization.default_to_private:
            link.is_private = True
            link.save()
        link.move_to_folder_for_user(folder, bundle.request.user)  # also sets link.organization

        uploaded_file = bundle.data.get('file')
        if uploaded_file:
            # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
            mime_type = get_mime_type(uploaded_file.name)
            file_name = 'upload.%s' % mime_type_lookup[mime_type]['new_extension']

            base_warc_url = "file:///%s/%s" % (link.guid, file_name)

            # only append a random number to warc_url if we're replacing a file
            warc_url = base_warc_url
            if bundle.data.get('replace'):
                warc_url = "%s?version=%s" % (base_warc_url, str(random.random()).replace('.', ''))
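            # The resulting warc_url looks like "file:///ABCD-1234/upload.pdf" for a fresh
            # upload, or "file:///ABCD-1234/upload.pdf?version=0123456789" when replacing,
            # presumably so the replacement record gets a URL distinct from the original's.
            # (The GUID and version digits shown here are illustrative, not real values.)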

            capture = Capture(link=link,
                              role='primary',
                              status='success',
                              record_type='resource',
                              user_upload='True',
                              content_type=mime_type,
                              url=warc_url)

            uploaded_file.file.seek(0)
            capture.write_warc_resource_record(uploaded_file)
            capture.save()

        else:
            # create primary capture placeholder
            Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            ).save()

            # create screenshot placeholder
            Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            ).save()

            # create CaptureJob
            CaptureJob(link=link, human=bundle.data.get('human', False)).save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            run_task(run_next_capture.s())

        return bundle
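
Note: get_mime_type and mime_type_lookup are imported helpers, not defined in this example. A minimal sketch of the shape the upload branch assumes (the MIME-type keys and the 'new_extension' field are inferred from how the lookup is indexed above, not copied from the Perma source):

# Hypothetical lookup: MIME type -> metadata used to normalize uploads to
# upload.jpg, upload.png, upload.gif, or upload.pdf, per the comment above.
mime_type_lookup = {
    'image/jpeg':      {'new_extension': 'jpg'},
    'image/png':       {'new_extension': 'png'},
    'image/gif':       {'new_extension': 'gif'},
    'application/pdf': {'new_extension': 'pdf'},
}

def get_mime_type(file_name):
    # assumed behavior: derive the MIME type from the uploaded file's extension
    import mimetypes
    return mimetypes.guess_type(file_name)[0]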
Example #2
    def post(self, request, format=None):
        """ Create new link. """
        data = request.data
        capture_job = CaptureJob(human=request.data.get('human', False),
                                 submitted_url=request.data.get('url', ''),
                                 created_by=request.user)
        if settings.ENABLE_BATCH_LINKS:
            # Batch is set directly on the request object by the LinkBatch api,
            # to prevent abuse of this feature by those POSTing directly to this route.
            if getattr(request, 'batch', None):
                capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
        capture_job.save()

        # Set target folder, in order of preference:
        # - 'folder' key in data
        # - parent folder, if posting to /folders/:parent_id/archives
        # - user's personal folder
        try:
            folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
        except ValidationError as e:
            raise_invalid_capture_job(capture_job, e.detail)

        # Make sure a limited user has links left to create
        if not folder.organization:
            if not request.user.link_creation_allowed():
                if request.user.nonpaying:
                    raise_invalid_capture_job(capture_job, "You've already reached your limit.")
                error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
                raise_invalid_capture_job(capture_job, error)
        else:
            registrar = folder.organization.registrar
            if not registrar.link_creation_allowed():
                error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
                if request.user.registrar:
                    contact = 'Visit your settings for subscription information.'
                else:
                    registrar_users = [user.email for user in registrar.active_registrar_users()]
                    contact = 'For assistance with your subscription, contact:  {}.'.format(", ".join(registrar_users))
                raise_invalid_capture_job(capture_job, error + contact)

        serializer = self.serializer_class(data=data, context={'request': request})
        if serializer.is_valid():

            link = serializer.save(created_by=request.user)

            # put link in folder and handle Org settings based on folder
            if folder.organization and folder.organization.default_to_private:
                link.is_private = True
                link.save()
            link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

            # handle uploaded file
            uploaded_file = request.data.get('file')
            if uploaded_file:
                link.write_uploaded_file(uploaded_file)
                link.warc_size = default_storage.size(link.warc_storage_file())
                link.save()

            # handle submitted url
            else:
                # create primary capture placeholder
                Capture(
                    link=link,
                    role='primary',
                    status='pending',
                    record_type='response',
                    url=link.submitted_url,
                ).save()

                # create screenshot placeholder
                Capture(
                    link=link,
                    role='screenshot',
                    status='pending',
                    record_type='resource',
                    url="file:///%s/cap.png" % link.guid,
                    content_type='image/png',
                ).save()

                # kick off capture tasks -- no need for guid since it'll work through the queue
                capture_job.status = 'pending'
                capture_job.link = link
                capture_job.save(update_fields=['status', 'link'])
                run_task(run_next_capture.s())

            return Response(serializer.data, status=status.HTTP_201_CREATED)

        raise_invalid_capture_job(capture_job, serializer.errors)
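
For context, a hypothetical client call against this view could look like the sketch below. The host, route, and auth header are placeholders; only the payload keys ('url', 'human', 'folder') and the 201 response come from the code above.

import requests

resp = requests.post(
    "https://perma.example/api/v1/archives/",                      # placeholder URL
    headers={"Authorization": "ApiKey user@example.com:secret"},   # placeholder auth scheme
    json={"url": "https://example.com/page", "human": True, "folder": 27},
)
print(resp.status_code)  # 201 on success; failures are reported via raise_invalid_capture_job
print(resp.json())       # the serialized link (serializer.data)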
Example #3
    def post(self, request, format=None):
        """ Create new link. """
        data = request.data
        capture_job = CaptureJob(
            human=request.data.get('human', False),
            submitted_url=request.data.get('url', ''),
            created_by=request.user
        )
        if settings.ENABLE_BATCH_LINKS:
            # Batch is set directly on the request object by the LinkBatch api,
            # to prevent abuse of this feature by those POSTing directly to this route.
            if getattr(request, 'batch', None):
                capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
        capture_job.save()


        # Set target folder, in order of preference:
        # - 'folder' key in data
        # - parent folder, if posting to /folders/:parent_id/archives
        # - user's personal folder
        try:
            folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
        except ValidationError as e:
            raise_invalid_capture_job(capture_job, e.detail)

        # Disallow creation of links in top-level sponsored folder
        if folder.is_sponsored_root_folder:
            error = "You can't make links directly in your Sponsored Links folder. Select a folder belonging to a sponsor."
            raise_invalid_capture_job(capture_job, error)

        # Make sure a limited user has links left to create
        if not folder.organization and not folder.sponsored_by:
            if not request.user.link_creation_allowed():
                if request.user.nonpaying:
                    raise_invalid_capture_job(capture_job, "You've already reached your limit.")
                error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
                raise_invalid_capture_job(capture_job, error)
        else:
            registrar = folder.sponsored_by if folder.sponsored_by else folder.organization.registrar

            msg = None
            if folder.read_only:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                msg = f"Your registrar has made this folder read-only. For assistance, contact: {', '.join(registrar_users)}."
            if not registrar.link_creation_allowed():
                error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
                if request.user.registrar:
                    contact = 'Visit your settings for subscription information.'
                else:
                    registrar_users = [user.email for user in registrar.active_registrar_users()]
                    contact = 'For assistance with your subscription, contact:  {}.'.format(", ".join(registrar_users))
                msg = error + contact
            if msg:
                raise_invalid_capture_job(capture_job, msg)

        serializer = self.serializer_class(data=data, context={'request': request})
        if serializer.is_valid():

            with transaction.atomic():
                # Technique from https://github.com/harvard-lil/capstone/blob/0f7fb80f26e753e36e0c7a6a199b8fdccdd318be/capstone/capapi/serializers.py#L121
                #
                # Fetch the current user data here inside a transaction, using select_for_update
                # to lock the row so we don't collide with any simultaneous requests
                user = request.user.__class__.objects.select_for_update().get(pk=request.user.pk)

                # If this is a Personal Link, and if the user only has bonus links left, decrement bonus links
                bonus_link = False
                if not folder.organization and not folder.sponsored_by:
                    links_remaining, _, bonus_links = user.get_links_remaining()
                    if bonus_links and not links_remaining:
                        # (this works because it's part of the same transaction with the select_for_update --
                        # we don't have to use the same object)
                        request.user.bonus_links = bonus_links - 1
                        request.user.save(update_fields=['bonus_links'])
                        bonus_link = True

                link = serializer.save(created_by=request.user, bonus_link=bonus_link)

            # put link in folder and handle Org settings based on folder
            if folder.organization and folder.organization.default_to_private:
                link.is_private = True
                link.save()
            link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

            # handle uploaded file
            uploaded_file = request.data.get('file')
            if uploaded_file:
                link.write_uploaded_file(uploaded_file)

            # handle submitted url
            else:
                # create primary capture placeholder
                Capture(
                    link=link,
                    role='primary',
                    status='pending',
                    record_type='response',
                    url=link.submitted_url,
                ).save()

                # create screenshot placeholder
                Capture(
                    link=link,
                    role='screenshot',
                    status='pending',
                    record_type='resource',
                    url="file:///%s/cap.png" % link.guid,
                    content_type='image/png',
                ).save()


                # kick off capture tasks -- no need for guid since it'll work through the queue
                capture_job.status = 'pending'
                capture_job.link = link
                capture_job.save(update_fields=['status', 'link'])
                run_task(run_next_capture.s())

            return Response(serializer.data, status=status.HTTP_201_CREATED)

        raise_invalid_capture_job(capture_job, serializer.errors)
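
The select_for_update block is the notable addition in this version: it locks the user's row for the duration of the transaction so that two simultaneous requests cannot both spend the same remaining bonus link. A standalone sketch of that pattern (the model accessor and helper name here are illustrative, not Perma's):

from django.db import transaction

def spend_bonus_link(user_model, user_id):
    """Atomically decrement a user's bonus_links; return False if none are left."""
    with transaction.atomic():
        # select_for_update() blocks concurrent transactions on this row until commit
        user = user_model.objects.select_for_update().get(pk=user_id)
        if user.bonus_links > 0:
            user.bonus_links -= 1
            user.save(update_fields=['bonus_links'])
            return True
        return False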
Example #4
    def post(self, request, format=None):
        """ Create new link. """
        data = request.data

        # Set target folder, in order of preference:
        # - 'folder' key in data
        # - parent folder, if posting to /folders/:parent_id/archives
        # - user's personal folder
        folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder

        # Make sure a limited user has links left to create
        if not folder.organization:
            links_remaining = request.user.get_links_remaining()
            if links_remaining < 1:
                raise_validation_error("You've already reached your limit.")
        else:
            registrar = folder.organization.registrar
            if not registrar.link_creation_allowed():
                error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
                if request.user.registrar:
                    contact = 'Visit your settings for subscription information.'
                else:
                    registrar_users = [user.email for user in registrar.active_registrar_users()]
                    contact = 'For assistance with your subscription, contact:  {}.'.format(", ".join(registrar_users))
                raise_validation_error(error + contact)

        serializer = self.serializer_class(data=data, context={'request': request})
        if serializer.is_valid():

            link = serializer.save(created_by=request.user)

            # put link in folder and handle Org settings based on folder
            if folder.organization and folder.organization.default_to_private:
                link.is_private = True
                link.save()
            link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

            # handle uploaded file
            uploaded_file = request.data.get('file')
            if uploaded_file:
                link.write_uploaded_file(uploaded_file)

            # handle submitted url
            else:
                # create primary capture placeholder
                Capture(
                    link=link,
                    role='primary',
                    status='pending',
                    record_type='response',
                    url=link.submitted_url,
                ).save()

                # create screenshot placeholder
                Capture(
                    link=link,
                    role='screenshot',
                    status='pending',
                    record_type='resource',
                    url="file:///%s/cap.png" % link.guid,
                    content_type='image/png',
                ).save()

                # create CaptureJob
                CaptureJob(link=link, human=request.data.get('human', False)).save()

                # kick off capture tasks -- no need for guid since it'll work through the queue
                run_task(run_next_capture.s())

            return Response(serializer.data, status=status.HTTP_201_CREATED)
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
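
raise_validation_error is not shown in this example; it is presumably a thin wrapper over DRF's ValidationError, which the framework renders as a 400 response much like the explicit HTTP_400_BAD_REQUEST branch at the end. A guess at its shape:

from rest_framework import serializers

def raise_validation_error(message):
    # DRF's default exception handler turns this into a 400 response
    raise serializers.ValidationError(message)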
Example #5
def proxy_capture(capture_job):
    """
    Start warcprox process. Warcprox is a MITM proxy server and needs to be running
    before, during and after the headless browser.

    Start a headless browser to capture the supplied URL. Also take a screenshot if the URL is an HTML file.

    This whole function runs with the local dir set to a temp dir by run_in_tempdir().
    So we can use local paths for temp files, and they'll just disappear when the function exits.
    """

    # basic setup
    link = capture_job.link
    target_url = link.safe_url

    if link.user_deleted or link.primary_capture.status != "pending":
        capture_job.mark_completed('deleted')
        return

    capture_job.attempt += 1
    capture_job.save()

    # Helper function to update capture_job's progress
    def display_progress(step_count, step_description):
        save_fields(capture_job,
                    step_count=step_count,
                    step_description=step_description)
        print "%s step %s: %s" % (link.guid, step_count, step_description)

    print "%s: Fetching %s" % (link.guid, target_url)
    progress = 0
    display_progress(progress, "Starting capture")

    # suppress verbose warcprox logs
    logging.disable(logging.INFO)

    # Set up an exception we can trigger to halt capture and release all the resources involved.
    class HaltCaptureException(Exception):
        pass

    browser = warcprox_controller = warcprox_thread = display = have_html = None
    have_warc = False
    thread_list = []
    successful_favicon_urls = []

    try:

        # create a request handler class that counts requests and responses
        proxied_requests = []
        proxied_responses = []
        proxied_pairs = []
        count_lock = threading.Lock()

        class CountingRequestHandler(WarcProxyHandler):
            def _proxy_request(self):

                # make sure we don't capture anything in a banned IP range
                if not url_in_allowed_ip_range(self.url):
                    return

                with count_lock:
                    proxied_pair = [self.url, None]
                    proxied_requests.append(proxied_pair[0])
                    proxied_pairs.append(proxied_pair)
                response = WarcProxyHandler._proxy_request(self)
                with count_lock:
                    proxied_responses.append(response)
                    proxied_pair[1] = response

        # connect warcprox to an open port
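        # (Try ports from 27500 upward; the for-loop's `else` clause below runs only if
        # the loop never hits `break`, i.e. no port in the range could be bound.)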
        warcprox_port = 27500
        recorded_url_queue = queue.Queue()
        for i in xrange(500):
            try:
                proxy = WarcProxy(server_address=("127.0.0.1", warcprox_port),
                                  recorded_url_q=recorded_url_queue,
                                  req_handler_class=CountingRequestHandler)
                break
            except socket_error as e:
                if e.errno != errno.EADDRINUSE:
                    raise
            warcprox_port += 1
        else:
            raise Exception("WarcProx couldn't find an open port.")
        proxy_address = "127.0.0.1:%s" % warcprox_port

        # start warcprox in the background
        warc_writer = WarcWriter(gzip=True, port=warcprox_port)
        warc_writer_thread = WarcWriterThread(
            recorded_url_q=recorded_url_queue, warc_writer=warc_writer)
        warcprox_controller = WarcproxController(proxy, warc_writer_thread)
        warcprox_thread = threading.Thread(
            target=warcprox_controller.run_until_shutdown,
            name="warcprox",
            args=())
        warcprox_thread.start()

        print "WarcProx opened."

        # Helper function to get a url, via proxied requests.get(), in a way that is interruptable from other threads.
        # This should only be run from sub-threads.
        def get_url(url):
            request_thread = add_thread(
                thread_list, ProxiedRequestThread(proxy_address, url))
            request_thread.join()
            return request_thread.response, request_thread.response_exception

        # start virtual display
        if settings.CAPTURE_BROWSER != "PhantomJS":
            display = Display(visible=0, size=(1024, 800))
            display.start()

        # fetch page in the background
        progress += 1
        display_progress(progress, "Fetching target URL")
        browser = get_browser(settings.CAPTURE_USER_AGENT, proxy_address,
                              proxy.ca.ca_file)
        browser.set_window_size(1024, 800)

        start_time = time.time()
        page_load_thread = threading.Thread(
            target=browser.get, args=(target_url, ))  # returns after onload
        page_load_thread.start()
        page_load_thread.join(ONLOAD_EVENT_TIMEOUT)

        # wait until warcprox records a response that isn't a forward
        have_response = False
        while not have_response:
            if proxied_responses:
                for request, response in proxied_pairs:
                    if response is None:
                        # Response hasn't finished yet -- we might get here because subsequent
                        # responses have finished, but we have to go in order to find the correct content_type,
                        # so let's wait for this one.
                        break
                    if response.url.endswith(
                            '/favicon.ico') and response.url != target_url:
                        continue
                    if not hasattr(response, 'parsed_response'):
                        response.parsed_response = parse_response(
                            response.response_recorder.headers)
                    if response.parsed_response.is_redirect or response.parsed_response.status_code == 206:  # partial content
                        continue

                    content_url = response.url
                    content_type = response.parsed_response.headers.get(
                        'content-type')
                    robots_directives = response.parsed_response.headers.get(
                        'x-robots-tag')
                    have_html = content_type and content_type.startswith(
                        'text/html')
                    have_response = True
                    break

            if have_response:
                have_warc = True  # at this point we have something that's worth showing to the user
                break

            wait_time = time.time() - start_time
            if wait_time > RESOURCE_LOAD_TIMEOUT:
                raise HaltCaptureException

            progress = int(progress) + wait_time / RESOURCE_LOAD_TIMEOUT
            display_progress(progress, "Fetching target URL")

            time.sleep(1)

        print "Finished fetching url."

        # check for x-robots-tag directives
        progress = int(progress) + 1
        display_progress(progress, "Checking x-robots-tag directives.")
        if robots_directives:
            darchive = False
            for directive in robots_directives.split(";"):
                parsed = directive.lower().split(":")
                # respect tags that target all crawlers (no user-agent specified)
                if len(parsed) == 1:
                    if "noarchive" in parsed:
                        darchive = True
                # look for perma user-agent
                elif len(parsed) == 2:
                    if parsed[0] == "perma" and "noarchive" in parsed[1]:
                        darchive = True
                # if the directive is poorly formed, do our best
                else:
                    if "perma" in directive and "noarchive" in directive:
                        darchive = True

            if darchive:
                save_fields(link, is_private=True, private_reason='policy')
                print "x-robots-tag found, darchiving"

        # get favicon urls
        # Here we fetch everything in the page that's marked as a favicon, for archival purposes.
        # But we only record a favicon as our favicon_capture_url if it passes a mimetype whitelist.
        def favicon_thread():
            favicon_urls = []
            if have_html and browser_still_running(browser):
                favicons = repeat_while_exception(
                    lambda: browser.find_elements_by_css_selector(
                        'link[rel="shortcut icon"],link[rel="icon"]'),
                    timeout=10)
                for candidate_favicon in favicons:
                    if candidate_favicon.get_attribute('href'):
                        candidate_favicon_url = urlparse.urljoin(
                            content_url,
                            candidate_favicon.get_attribute('href'))
                        favicon_urls.append(candidate_favicon_url)
            favicon_urls.append(urlparse.urljoin(content_url, '/favicon.ico'))
            if not favicon_urls:
                return

            for favicon_url in favicon_urls:
                print "Fetching favicon from %s ..." % favicon_url
                favicon_response, e = get_url(favicon_url)
                if e or not favicon_response or not favicon_response.ok:
                    print "Favicon failed:", e, favicon_response
                    continue

                # apply mime type whitelist
                mime_type = favicon_response.headers.get('content-type', '').split(';')[0]
                if mime_type not in VALID_FAVICON_MIME_TYPES:
                    continue

                successful_favicon_urls.append((favicon_url, mime_type))

            if not successful_favicon_urls:
                print "Couldn't get favicon"

        add_thread(thread_list, favicon_thread)

        # fetch robots.txt in the background
        def robots_txt_thread():
            print "Fetching robots.txt ..."
            robots_txt_location = urlparse.urljoin(content_url, '/robots.txt')
            robots_txt_response, e = get_url(robots_txt_location)
            if e or not robots_txt_response or not robots_txt_response.ok:
                print "Couldn't reach robots.txt"
                return

            # We only want to respect robots.txt if Perma is specifically asked not to archive (we're not a crawler)
            if 'Perma' in robots_txt_response.content:
                # We found Perma specifically mentioned
                rp = robotparser.RobotFileParser()
                rp.parse([
                    line.strip()
                    for line in robots_txt_response.content.split('\n')
                ])
                if not rp.can_fetch('Perma', target_url):
                    save_fields(link, is_private=True, private_reason='policy')
                    print "Robots.txt fetched."

        add_thread(thread_list, robots_txt_thread)

        if have_html:

            # check meta tags
            print "Checking meta tags."

            def meta_analysis_failed():
                if settings.PRIVATE_LINKS_ON_FAILURE:
                    save_fields(link,
                                is_private=True,
                                private_reason='failure')
                link.tags.add('meta-tag-retrieval-failure')
                print "Meta tag retrieval failure."

            if browser_still_running(browser):

                def meta_thread():
                    def js_get_tags():
                        return browser.execute_script("""
                            var meta_tags = document.getElementsByTagName('meta');
                            var tags = [];
                            for (var i = 0; i < meta_tags.length; i++){
                                tags.push({"name":meta_tags[i].name, "content":meta_tags[i].content});
                            }
                            return tags
                        """)

                    # get all meta tags
                    meta_tags = repeat_while_exception(
                        lambda: browser.find_elements_by_tag_name('meta'),
                        timeout=30)

                    # if that retrieves even one meta tag, we need to succeed at parsing
                    # them before we can confidently make a link public
                    if meta_tags:
                        meta_list = None
                        try:
                            # assemble required attributes for processing.
                            # this sometimes fails because javascript alters the DOM sufficiently
                            # that the elements found above are no longer available by the time
                            # we wish to iterate through them. in that case, a StaleElementReferenceException
                            # is thrown.
                            # http://www.seleniumhq.org/exceptions/stale_element_reference.jsp
                            # but, since this is important to get right, catch all exceptions,
                            # not just StaleElementReferenceException. If an exception is thrown, we want
                            # to try the javascript method regardless.
                            meta_list = [
                                {"name": tag.get_attribute('name'), "content": tag.get_attribute("content")}
                                for tag in meta_tags
                            ]
                        except:
                            meta_list = repeat_until_truthy(js_get_tags, sleep_time=1)
                            if not meta_list:
                                # if still no meta tags at all, then this process has failed;
                                # bail out rather than iterating over an empty meta_list below.
                                meta_analysis_failed()
                                return

                        # first look for <meta name='perma'>
                        meta_tag = next((tag for tag in meta_list
                                         if tag['name'].lower() == 'perma'),
                                        None)
                        # else look for <meta name='robots'>
                        if not meta_tag:
                            meta_tag = next(
                                (tag for tag in meta_list
                                 if tag['name'].lower() == 'robots'), None)
                        # if we found a relevant meta tag, check for noarchive
                        if meta_tag and 'noarchive' in meta_tag["content"].lower():
                            save_fields(link,
                                        is_private=True,
                                        private_reason='policy')
                            print "Meta found, darchiving"

                        description_meta_tag = next(
                            (tag for tag in meta_list
                             if tag['name'].lower() == 'description'), '')
                        if description_meta_tag and description_meta_tag['content']:
                            save_fields(link,
                                        submitted_description=description_meta_tag['content'])

                add_thread(thread_list, meta_thread)
            else:
                meta_analysis_failed()

            # Skip all these things if the browser has died.
            if browser_still_running(browser):
                # get page title
                progress = int(progress) + 1
                display_progress(progress, "Getting page title")

                def get_title():
                    if browser.title:
                        save_fields(link, submitted_title=browser.title)
                    else:
                        title_element = browser.find_element_by_tag_name("title")
                        save_fields(link,
                                    submitted_title=title_element.get_attribute("text"))

                repeat_while_exception(get_title,
                                       timeout=10,
                                       raise_after_timeout=False)

                # scroll to bottom of page, in case that prompts anything else to load
                # TODO: This doesn't scroll horizontally or scroll frames
                progress += .5
                display_progress(progress, "Checking for scroll-loaded assets")

                def scroll_browser():
                    try:
                        scroll_delay = browser.execute_script("""
                            // Scroll down the page in a series of jumps the size of the window height.
                            // The actual scrolling is done in a setTimeout with a 50ms delay so the browser has
                            // time to render at each position.
                            var delay=50,
                                height=document.body.scrollHeight,
                                jump=window.innerHeight,
                                scrollTo=function(scrollY){ window.scrollTo(0, scrollY) },
                                i=1;
                            for(;i*jump<height;i++){
                                setTimeout(scrollTo, i*delay, i*jump);
                            }

                            // Scroll back to top before taking screenshot.
                            setTimeout(scrollTo, i*delay, 0);

                            // Return how long all this scrolling will take.
                            return (i*delay)/1000;
                        """)

                        # In python, wait for javascript background scrolling to finish.
                        time.sleep(min(scroll_delay, 1))
                    except (WebDriverException, URLError):
                        # Don't panic if we can't scroll -- we've already captured something useful anyway.
                        # WebDriverException: the page can't execute JS for some reason.
                        # URLError: the headless browser has gone away for some reason.
                        pass

                repeat_while_exception(scroll_browser)

                # load media
                progress = int(progress) + 1
                display_progress(progress, "Fetching media")
                with warn_on_exception("Error fetching media"):
                    # running in each frame ...
                    def get_media_tags(browser):
                        url_set = []
                        base_url = browser.current_url

                        def make_absolute_urls(urls):
                            '''collect resource urls, converted to absolute urls relative to current browser frame'''
                            return [
                                urlparse.urljoin(base_url, url) for url in urls
                                if url
                            ]

                        # get all images in srcsets
                        print("Fetching images in srcsets")
                        for img in browser.find_elements_by_css_selector(
                                'img[srcset], source[srcset]'):
                            urls = [
                                src.strip().split(' ')[0] for src in
                                img.get_attribute('srcset').split(',')
                            ]
                            url_set.extend(make_absolute_urls(urls))

                        # fetch each audio/video/object/embed element
                        if settings.ENABLE_AV_CAPTURE:
                            print("Fetching audio/video objects")
                            media_tags = sum(
                                (browser.find_elements_by_tag_name(tag_name)
                                 for tag_name in ('video', 'audio', 'object',
                                                  'embed')), [])
                            for tag in media_tags:
                                # for each tag, extract all resource urls
                                if tag.tag_name == 'object':
                                    # for <object>, get the data and archive attributes, prepended with codebase attribute if it exists,
                                    # as well as any <param name="movie" value="url"> elements
                                    codebase_url = tag.get_attribute(
                                        'codebase') or base_url
                                    urls = [
                                        urlparse.urljoin(codebase_url, url) for
                                        url in [tag.get_attribute('data')] +
                                        (tag.get_attribute('archive') or ''
                                         ).split()
                                    ] + [
                                        param.get_attribute('value') for param
                                        in tag.find_elements_by_css_selector(
                                            'param[name="movie"]')
                                    ]
                                else:
                                    # for <audio>, <video>, and <embed>, get src attribute and any <source src="url"> elements
                                    urls = [tag.get_attribute('src')] + [
                                        source.get_attribute('src')
                                        for source in
                                        tag.find_elements_by_tag_name('source')
                                    ]

                                url_set.extend(make_absolute_urls(urls))

                        return url_set

                    media_urls = run_in_frames(browser, get_media_tags)

                    # grab all media urls that aren't already being grabbed
                    for media_url in set(media_urls) - set(proxied_requests):
                        add_thread(
                            thread_list,
                            ProxiedRequestThread(proxy_address, media_url))

        # Wait AFTER_LOAD_TIMEOUT seconds for any requests to finish that are started within the next .5 seconds.
        progress = int(progress) + 1
        display_progress(progress, "Waiting for post-load requests")
        time.sleep(.5)
        unfinished_proxied_pairs = [
            pair for pair in proxied_pairs if not pair[1]
        ]
        start_time = time.time()
        while unfinished_proxied_pairs and browser_still_running(browser):

            print "Waiting for %s pending requests" % len(
                unfinished_proxied_pairs)

            # give up after AFTER_LOAD_TIMEOUT seconds
            wait_time = time.time() - start_time
            if wait_time > AFTER_LOAD_TIMEOUT:
                print "Waited %s seconds to finish post-load requests -- giving up." % AFTER_LOAD_TIMEOUT
                break

            # Give up if downloaded size exceeds MAX_ARCHIVE_FILE_SIZE.
            # Amount captured so far is the sum of the bytes recorded by warcprox,
            # and the bytes pending in our background threads.
            capture_size = sum(getattr(thread, 'pending_data', 0) for thread in thread_list) + \
                           sum(proxied_response.response_recorder.len for proxied_response in proxied_responses)
            if capture_size > settings.MAX_ARCHIVE_FILE_SIZE:
                print "Halting for size"
                break

            # Show progress to user
            progress = int(progress) + wait_time / AFTER_LOAD_TIMEOUT
            display_progress(progress, "Waiting for post-load requests")

            # Sleep and update our list
            time.sleep(.5)
            unfinished_proxied_pairs = [
                pair for pair in unfinished_proxied_pairs if not pair[1]
            ]

        # screenshot capture
        if have_html and browser_still_running(browser):
            # get page size to decide whether to take a screenshot
            capture_screenshot = False
            pixel_count = 0
            try:
                root_element = browser.find_element_by_tag_name('body')
            except (NoSuchElementException, URLError):
                try:
                    root_element = browser.find_element_by_tag_name('frameset')
                except (NoSuchElementException, URLError):
                    # If we can't find the root element, we just won't capture a screenshot.
                    # NoSuchElementException: HTML structure is weird somehow.
                    # URLError: the headless browser has gone away for some reason.
                    root_element = None
            if root_element:
                page_size = root_element.size
                pixel_count = page_size['width'] * page_size['height']
                capture_screenshot = pixel_count < settings.MAX_IMAGE_SIZE

            # take screenshot after all requests done
            if capture_screenshot:
                progress += 1
                display_progress(progress, "Taking screenshot")
                screenshot_data = browser.get_screenshot_as_png()
                link.screenshot_capture.write_warc_resource_record(
                    screenshot_data)
                save_fields(link.screenshot_capture, status='success')
            else:
                print "Not saving screenshots! %s" % (
                    "Page size is %s pixels." %
                    pixel_count if pixel_count else "")
                save_fields(link.screenshot_capture, status='failed')
        else:
            # no screenshot if not HTML
            save_fields(link.screenshot_capture, status='failed')

    except HaltCaptureException:
        pass

    finally:
        # teardown (have to do this before save to make sure WARC is done writing):
        print "Shutting down browser and proxies."

        for thread in thread_list:
            # wait until threads are done (have to do this before closing phantomjs)
            if hasattr(thread, 'stop'):
                thread.stop.set()
            thread.join()
        if browser:
            browser.quit()  # shut down phantomjs
        if display:
            display.stop()  # shut down virtual display
        if warcprox_controller:
            warcprox_controller.stop.set()  # send signal to shut down warc thread
        if warcprox_thread:
            warcprox_thread.join()  # wait until warcprox thread is done writing out warc

        # un-suppress logging
        logging.disable(logging.NOTSET)

    # save generated warc file
    if have_warc:
        progress += 1
        display_progress(progress, "Saving web archive file")

        temp_warc_path = os.path.join(warc_writer.directory,
                                      warc_writer._f_finalname)
        with open(temp_warc_path, 'rb') as warc_file:
            link.write_warc_raw_data(warc_file)

        print "Writing CDX lines to the DB"
        CDXLine.objects.create_all_from_link(link)

        save_fields(
            link.primary_capture,
            status='success',
            content_type=content_type,
        )
        save_fields(link,
                    warc_size=default_storage.size(link.warc_storage_file()))
        capture_job.mark_completed()

        # We only save the Capture for the favicon once the warc is stored,
        # since the data for the favicon lives in the warc.
        if successful_favicon_urls:
            Capture(link=link,
                    role='favicon',
                    status='success',
                    record_type='response',
                    url=successful_favicon_urls[0][0],
                    content_type=successful_favicon_urls[0][1]).save()
            print "Saved favicon at %s" % successful_favicon_urls

        print "%s capture succeeded." % link.guid
Example #6
def proxy_capture(self, link_guid, user_agent=''):
    """
    start warcprox process. Warcprox is a MITM proxy server and needs to be running
    before, during and after phantomjs gets a screenshot.

    Create an image from the supplied URL, write it to disk and update our asset model with the path.
    The heavy lifting is done by PhantomJS, our headless browser.

    This whole function runs with the local dir set to a temp dir by run_in_tempdir().
    So we can use local paths for temp files, and they'll just disappear when the function exits.
    """
    # basic setup
    link = Link.objects.get(guid=link_guid)
    target_url = link.safe_url

    # allow pending tasks to be canceled outside celery by updating capture status
    if link.primary_capture.status != "pending":
        return

    # Override user_agent for now, since PhantomJS doesn't like some user agents.
    # This user agent is the Chrome on Linux that's most like PhantomJS 2.1.1.
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.37 Safari/537.36"

    print "%s: Fetching %s" % (link_guid, target_url)

    # suppress verbose warcprox logs
    logging.disable(logging.INFO)

    # Set up an exception we can trigger to halt capture and release all the resources involved.
    class HaltCaptureException(Exception):
        pass

    browser = warcprox_controller = warcprox_thread = display = have_html = None
    have_warc = False
    thread_list = []
    successful_favicon_urls = []

    try:

        # create a request handler class that counts requests and responses
        proxied_requests = []
        proxied_responses = []
        count_lock = threading.Lock()

        class CountingRequestHandler(WarcProxyHandler):
            def _proxy_request(self):
                with count_lock:
                    proxied_requests.append(self.url)
                response = WarcProxyHandler._proxy_request(self)
                with count_lock:
                    proxied_responses.append(response)

        # connect warcprox to an open port
        warcprox_port = 27500
        recorded_url_queue = queue.Queue()
        for i in xrange(500):
            try:
                proxy = WarcProxy(server_address=("127.0.0.1", warcprox_port),
                                  recorded_url_q=recorded_url_queue,
                                  req_handler_class=CountingRequestHandler)
                break
            except socket_error as e:
                if e.errno != errno.EADDRINUSE:
                    raise
            warcprox_port += 1
        else:
            raise self.retry(
                exc=Exception("WarcProx couldn't find an open port."))
        proxy_address = "127.0.0.1:%s" % warcprox_port

        # set up requests getter for one-off requests outside of selenium
        def proxied_get_request(url):
            return requests.get(url,
                                headers={'User-Agent': user_agent},
                                proxies={
                                    'http': 'http://' + proxy_address,
                                    'https': 'http://' + proxy_address
                                },
                                verify=False)

        # start warcprox in the background
        warc_writer = WarcWriter(gzip=True, port=warcprox_port)
        warc_writer_thread = WarcWriterThread(
            recorded_url_q=recorded_url_queue, warc_writer=warc_writer)
        warcprox_controller = WarcproxController(proxy, warc_writer_thread)
        warcprox_thread = threading.Thread(
            target=warcprox_controller.run_until_shutdown,
            name="warcprox",
            args=())
        warcprox_thread.start()

        print "WarcProx opened."

        # start virtual display
        if settings.CAPTURE_BROWSER != "PhantomJS":
            display = Display(visible=0, size=(1024, 800))
            display.start()

        # fetch page in the background
        print "Fetching url."
        browser = get_browser(user_agent, proxy_address, proxy.ca.ca_file)
        browser.set_window_size(1024, 800)

        start_time = time.time()
        page_load_thread = threading.Thread(
            target=browser.get, args=(target_url, ))  # returns after onload
        page_load_thread.start()
        page_load_thread.join(ONLOAD_EVENT_TIMEOUT)

        # wait until warcprox records a response that isn't a forward
        have_response = False
        while not have_response:
            if proxied_responses:
                for response in proxied_responses:
                    if response.url.endswith(
                            '/favicon.ico') and response.url != target_url:
                        continue
                    if not hasattr(response, 'parsed_response'):
                        response.parsed_response = parse_response(
                            response.response_recorder.headers)
                    if response.parsed_response.is_redirect or response.parsed_response.status_code == 206:  # partial content
                        continue

                    content_url = response.url
                    content_type = response.parsed_response.headers.get(
                        'content-type')
                    have_html = content_type and content_type.startswith(
                        'text/html')
                    have_response = True
                    break

            if time.time() - start_time > RESOURCE_LOAD_TIMEOUT:
                raise HaltCaptureException
            time.sleep(1)

        print "Finished fetching url."

        # get favicon urls
        # Here we fetch everything in the page that's marked as a favicon, for archival purposes.
        # But we only record a favicon as our favicon_capture_url if it passes a mimetype whitelist.
        def favicon_thread():
            favicon_urls = []
            if have_html:
                favicons = repeat_while_exception(
                    lambda: browser.find_elements_by_css_selector(
                        'link[rel="shortcut icon"],link[rel="icon"]'),
                    timeout=10)
                for candidate_favicon in favicons:
                    if candidate_favicon.get_attribute('href'):
                        candidate_favicon_url = urlparse.urljoin(
                            content_url,
                            candidate_favicon.get_attribute('href'))
                        favicon_urls.append(candidate_favicon_url)
            favicon_urls.append(urlparse.urljoin(content_url, '/favicon.ico'))
            if not favicon_urls:
                return

            for favicon_url in favicon_urls:
                print "Fetching favicon from %s ..." % favicon_url
                try:
                    favicon_response = proxied_get_request(favicon_url)
                    assert favicon_response.ok
                except (requests.ConnectionError, requests.Timeout,
                        AssertionError) as e:
                    print "Failed:", e
                    continue

                # apply mime type whitelist
                mime_type = favicon_response.headers.get('content-type', '').split(';')[0]
                if mime_type not in VALID_FAVICON_MIME_TYPES:
                    continue

                successful_favicon_urls.append((favicon_url, mime_type))

            if not successful_favicon_urls:
                print "Couldn't get favicon"

        add_thread(thread_list, favicon_thread)

        # fetch robots.txt in the background
        def robots_txt_thread():
            print "Fetching robots.txt ..."
            robots_txt_location = urlparse.urljoin(content_url, '/robots.txt')
            try:
                robots_txt_response = proxied_get_request(robots_txt_location)
                assert robots_txt_response.ok
            except (requests.ConnectionError, requests.Timeout,
                    AssertionError):
                print "Couldn't reach robots.txt"
                return

            # We only want to respect robots.txt if Perma is specifically asked not to archive (we're not a crawler)
            if 'Perma' in robots_txt_response.content:
                # We found Perma specifically mentioned
                rp = robotparser.RobotFileParser()
                rp.parse([
                    line.strip()
                    for line in robots_txt_response.content.split('\n')
                ])
                if not rp.can_fetch('Perma', target_url):
                    save_fields(link, is_private=True, private_reason='policy')
                    print "Robots.txt fetched."

        add_thread(thread_list, robots_txt_thread)

        if have_html:
            # get page title
            print "Getting title."

            def get_title():
                if browser.title:
                    save_fields(link, submitted_title=browser.title)

            repeat_while_exception(get_title, timeout=10)

            # check meta tags
            print "Checking meta tags."

            def meta_thread():
                # get all meta tags
                meta_tags = repeat_while_exception(
                    lambda: browser.find_elements_by_tag_name('meta'),
                    timeout=10)
                # first look for <meta name='perma'>
                meta_tag = next(
                    (tag for tag in meta_tags
                     if tag.get_attribute('name').lower() == 'perma'), None)
                # else look for <meta name='robots'>
                if not meta_tag:
                    meta_tag = next(
                        (tag for tag in meta_tags
                         if tag.get_attribute('name').lower() == 'robots'),
                        None)
                # if we found a relevant meta tag, check for noarchive
                if meta_tag and 'noarchive' in meta_tag.get_attribute("content").lower():
                    save_fields(link, is_private=True, private_reason='policy')
                    print "Meta found, darchiving"

            add_thread(thread_list, meta_thread)

            # scroll to bottom of page, in case that prompts anything else to load
            # TODO: This doesn't scroll horizontally or scroll frames
            def scroll_browser():
                try:
                    scroll_delay = browser.execute_script("""
                        // Scroll down the page in a series of jumps the size of the window height.
                        // The actual scrolling is done in a setTimeout with a 50ms delay so the browser has
                        // time to render at each position.
                        var delay=50,
                            height=document.body.scrollHeight,
                            jump=window.innerHeight,
                            scrollTo=function(scrollY){ window.scrollTo(0, scrollY) },
                            i=1;
                        for(;i*jump<height;i++){
                            setTimeout(scrollTo, i*delay, i*jump);
                        }

                        // Scroll back to top before taking screenshot.
                        setTimeout(scrollTo, i*delay, 0);

                        // Return how long all this scrolling will take.
                        return (i*delay)/1000;
                    """)

                    # In python, wait for javascript background scrolling to finish.
                    time.sleep(min(scroll_delay, 1))
                except WebDriverException:
                    pass

            repeat_while_exception(scroll_browser)

        # make sure all requests are finished
        print "Waiting for post-load requests."
        start_time = time.time()
        time.sleep(1)
        while True:
            print "%s/%s finished" % (len(proxied_responses),
                                      len(proxied_requests))
            response_count = len(proxied_responses)
            if time.time() - start_time > AFTER_LOAD_TIMEOUT:
                print "Waited %s seconds to finish post-load requests -- giving up." % AFTER_LOAD_TIMEOUT
                break
            time.sleep(.5)
            if response_count == len(proxied_requests):
                break

        if have_html:
            # get page size to decide whether to take a screenshot
            capture_screenshot = False
            pixel_count = 0
            try:
                root_element = browser.find_element_by_tag_name('body')
            except NoSuchElementException:
                try:
                    root_element = browser.find_element_by_tag_name('frameset')
                except NoSuchElementException:
                    root_element = None
            if root_element:
                page_size = root_element.size
                pixel_count = page_size['width'] * page_size['height']
                capture_screenshot = pixel_count < settings.MAX_IMAGE_SIZE

            # take screenshot after all requests done
            if capture_screenshot:
                print "Taking screenshot."
                screenshot_data = browser.get_screenshot_as_png()
                link.screenshot_capture.write_warc_resource_record(
                    screenshot_data)
                save_fields(link.screenshot_capture, status='success')
            else:
                print "Not saving screenshots! %s" % (
                    "Page size is %s pixels." %
                    pixel_count if pixel_count else "")
                save_fields(link.screenshot_capture, status='failed')
        else:
            # no screenshot if not HTML
            save_fields(link.screenshot_capture, status='failed')

        have_warc = True

    except HaltCaptureException:
        pass

    except Exception as e:
        traceback.print_exc()
        raise

    finally:
        # teardown (have to do this before save to make sure WARC is done writing):
        print "Shutting down browser and proxies."

        for thread in thread_list:
            thread.join()  # wait until threads are done (have to do this before closing phantomjs)
        if browser:
            browser.quit()  # shut down phantomjs
        if display:
            display.stop()  # shut down virtual display
        if warcprox_controller:
            warcprox_controller.stop.set()  # send signal to shut down warc thread
        if warcprox_thread:
            warcprox_thread.join()  # wait until warcprox thread is done writing out warc

        # un-suppress logging
        logging.disable(logging.NOTSET)

    # save generated warc file
    if have_warc:
        print "Saving WARC."
        try:
            temp_warc_path = os.path.join(warc_writer.directory,
                                          warc_writer._f_finalname)
            with open(temp_warc_path, 'rb') as warc_file:
                link.write_warc_raw_data(warc_file)
                save_fields(link.primary_capture,
                            status='success',
                            content_type=content_type)

            # We only save the Capture for the favicon once the warc is stored,
            # since the data for the favicon lives in the warc.
            if successful_favicon_urls:
                Capture(link=link,
                        role='favicon',
                        status='success',
                        record_type='response',
                        url=successful_favicon_urls[0][0],
                        content_type=successful_favicon_urls[0][1]).save()
                print "Saved favicon at %s" % successful_favicon_urls

            print "Writing CDX lines to the DB"
            CDXLine.objects.create_all_from_link(link)

        except Exception as e:
            print "Web Archive File creation failed for %s: %s" % (target_url,
                                                                   e)
            save_fields(link.primary_capture, status='failed')

    print "%s capture done." % link_guid