def _post_process_results(s, start, end, page_size, search_results, request,
                          filter_dead) -> List[Hit]:
    """
    After fetching the search results from the back end, iterate through the
    results, perform image validation, and route thumbnails through our
    proxy.

    When validation leaves fewer than ``page_size`` results, the result slice
    is widened, the query is executed again, and the new response is
    post-processed recursively.

    :param s: The Elasticsearch Search object.
    :param start: The start of the result slice.
    :param end: The end of the result slice.
    :param page_size: The maximum number of results to return.
    :param search_results: The Elasticsearch response object containing search
    results.
    :param request: The Django request object, used to build the proxied
    thumbnail URL.
    :param filter_dead: Whether images should be validated.
    :return: List of results, at most ``page_size`` long.
    """
    results = []
    to_validate = []
    for res in search_results:
        if hasattr(res.meta, 'highlight'):
            res.fields_matched = dir(res.meta.highlight)
        to_validate.append(res.url)
        if PROXY_THUMBS:
            # Route all images through a dynamically resizing caching proxy.
            # The proxied URL is built from the image identifier plus the
            # text after the last "." of the original URL.
            ext = res["url"].split(".")[-1]
            proxied = "http://{}{}".format(
                request.get_host(),
                reverse('thumbs',
                        kwargs={
                            'identifier':
                            "{}.{}".format(res["identifier"], ext)
                        }))
            res[THUMBNAIL] = proxied
        results.append(res)

    if filter_dead:
        query_hash = get_query_hash(s)
        # NOTE(review): presumably validate_images prunes dead entries from
        # `results` in place, which is why the length is re-checked below --
        # confirm against its definition.
        validate_images(query_hash, start, results, to_validate)

        if len(results) < page_size:
            # Widen the window by half its size, but always by at least one:
            # with end <= 1 the old `int(end / 2)` step was 0, so the same
            # slice was re-queried and the recursion never terminated.
            end += max(1, end // 2)
            if start + end > ELASTICSEARCH_MAX_RESULT_WINDOW:
                # Cannot widen any further; return what survived validation.
                return results

            s = s[start:end]
            search_response = s.execute()

            return _post_process_results(s, start, end, page_size,
                                         search_response, request, filter_dead)
    return results[:page_size]
Ejemplo n.º 2
0
def _post_process_results(s, start, end, page_size, search_results, request,
                          filter_dead) -> List[Hit]:
    """
    After fetching the search results from the back end, iterate through the
    results, add links to detail views, perform image validation, and route
    certain thumbnails through our proxy.

    When validation leaves fewer than ``page_size`` results, the result slice
    is widened, the query is executed again, and the new response is
    post-processed recursively.

    :param s: The Elasticsearch Search object.
    :param start: The start of the result slice.
    :param end: The end of the result slice.
    :param page_size: The maximum number of results to return.
    :param search_results: The Elasticsearch response object containing search
    results.
    :param request: The Django request object, used to build a "reversed" URL
    to detail pages.
    :param filter_dead: Whether images should be validated.
    :return: List of results, at most ``page_size`` long.
    """
    results = []
    to_validate = []
    for res in search_results:
        url = request.build_absolute_uri(
            reverse('image-detail', [res.identifier]))
        res.detail = url
        if hasattr(res.meta, 'highlight'):
            res.fields_matched = dir(res.meta.highlight)
        to_validate.append(res.url)
        if PROXY_THUMBS:
            # Proxy thumbnails from providers who don't provide SSL. We also
            # have a list of providers that have poor quality or no thumbnails,
            # so we produce our own on-the-fly.
            provider = res[PROVIDER]
            if THUMBNAIL in res and provider not in PROXY_ALL:
                to_proxy = THUMBNAIL
            else:
                to_proxy = URL
            original = res[to_proxy]
            # Only rewrite the thumbnail when the chosen link contains an
            # insecure scheme or the provider is always proxied.
            if 'http://' in original or provider in PROXY_ALL:
                secure = '{proxy_url}/{width}/{original}'.format(
                    proxy_url=THUMBNAIL_PROXY_URL,
                    width=THUMBNAIL_WIDTH_PX,
                    original=original)
                res[THUMBNAIL] = secure
        results.append(res)

    if filter_dead:
        query_hash = get_query_hash(s)
        # NOTE(review): presumably validate_images prunes dead entries from
        # `results` in place, which is why the length is re-checked below --
        # confirm against its definition.
        validate_images(query_hash, start, results, to_validate)

        if len(results) < page_size:
            # Widen the window by half its size, but always by at least one:
            # with end <= 1 the old `int(end / 2)` step was 0, so the same
            # slice was re-queried and the recursion never terminated.
            end += max(1, end // 2)
            if start + end > ELASTICSEARCH_MAX_RESULT_WINDOW:
                # Cannot widen any further; return what survived validation.
                return results

            s = s[start:end]
            search_response = s.execute()

            return _post_process_results(s, start, end, page_size,
                                         search_response, request, filter_dead)
    return results[:page_size]
def _post_process_results(s, start, end, page_size, search_results, request,
                          filter_dead) -> List[Hit]:
    """
    After fetching the search results from the back end, iterate through the
    results, perform image validation, and route every thumbnail through our
    proxy.

    When validation leaves fewer than ``page_size`` results, the result slice
    is widened, the query is executed again, and the new response is
    post-processed recursively.

    :param s: The Elasticsearch Search object.
    :param start: The start of the result slice.
    :param end: The end of the result slice.
    :param page_size: The maximum number of results to return.
    :param search_results: The Elasticsearch response object containing search
    results.
    :param request: The Django request object, used to build the proxied
    thumbnail URL.
    :param filter_dead: Whether images should be validated.
    :return: List of results, at most ``page_size`` long.
    """
    results = []
    to_validate = []
    for res in search_results:
        if hasattr(res.meta, 'highlight'):
            res.fields_matched = dir(res.meta.highlight)
        to_validate.append(res.url)
        if PROXY_THUMBS:
            # Route all images through a dynamically resizing caching proxy.
            proxied = "https://{}{}".format(
                request.get_host(),
                reverse('thumbs', kwargs={'identifier': res["identifier"]}))
            res[THUMBNAIL] = proxied
        results.append(res)

    if filter_dead:
        query_hash = get_query_hash(s)
        # NOTE(review): presumably validate_images prunes dead entries from
        # `results` in place, which is why the length is re-checked below --
        # confirm against its definition.
        validate_images(query_hash, start, results, to_validate)

        if len(results) < page_size:
            # Widen the window by half its size, but always by at least one:
            # with end <= 1 the old `int(end / 2)` step was 0, so the same
            # slice was re-queried and the recursion never terminated.
            end += max(1, end // 2)
            if start + end > ELASTICSEARCH_MAX_RESULT_WINDOW:
                # Cannot widen any further; return what survived validation.
                return results

            s = s[start:end]
            search_response = s.execute()

            return _post_process_results(s, start, end, page_size,
                                         search_response, request, filter_dead)
    return results[:page_size]
Ejemplo n.º 4
0
def _post_process_results(search_results, request, filter_dead):
    """
    After fetching the search results from the back end, iterate through the
    results, add links to detail views, perform image validation, and route
    certain thumbnails through out proxy.
    :param search_results: The Elasticsearch response object containing search
    results.
    :param request: The Django request object, used to build a "reversed" URL
    to detail pages.
    :param filter_dead: Whether images should be validated.
    """
    results = []
    to_validate = []
    for res in search_results:
        url = request.build_absolute_uri(
            reverse('image-detail', [res.identifier]))
        res.detail = url
        if hasattr(res.meta, 'highlight'):
            res.fields_matched = dir(res.meta.highlight)
        to_validate.append(res.url)
        if PROXY_THUMBS:
            # Proxy thumbnails from providers who don't provide SSL. We also
            # have a list of providers that have poor quality or no thumbnails,
            # so we produce our own on-the-fly.
            provider = res[PROVIDER]
            if THUMBNAIL in res and provider not in PROXY_ALL:
                to_proxy = THUMBNAIL
            else:
                to_proxy = URL
            if 'http://' in res[to_proxy] or provider in PROXY_ALL:
                original = res[to_proxy]
                secure = '{proxy_url}/{width}/{original}'.format(
                    proxy_url=THUMBNAIL_PROXY_URL,
                    width=THUMBNAIL_WIDTH_PX,
                    original=original)
                res[THUMBNAIL] = secure
        results.append(res)
    if filter_dead:
        validate_images(results, to_validate)
    return results
Ejemplo n.º 5
0
    def get(self, request, format=None):
        """
        Run an image search for the request's query parameters.

        Responds with status 400 when the query parameters fail validation or
        when the search controller raises ``ValueError``; otherwise responds
        with status 200 and a payload holding the result count, the page
        count, and the serialized results.

        :param request: The incoming request; its query parameters drive the
        search.
        :param format: Not used by this method.
        :return: A ``Response`` object.
        """
        # Parse and validate query parameters.
        query = ImageSearchQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(status=400,
                            data={"validation_error": query.errors})

        hashed_ip = hash(_get_user_ip(request))
        page_param = query.data[PAGE]
        page_size = query.data[PAGESIZE]

        try:
            search_results = search_controller.search(
                query,
                index='image',
                page_size=page_size,
                ip=hashed_ip,
                page=page_param,
            )
        except ValueError:
            return Response(
                status=400,
                data={VALIDATION_ERROR: 'Deep pagination is not allowed.'},
            )

        # Walk the hits, attaching a link to each hit's detail view and
        # gathering the image URLs for optional validation.
        hits = []
        image_urls = []
        for hit in search_results:
            hit.detail = request.build_absolute_uri(
                reverse('image-detail', [hit.identifier])
            )
            image_urls.append(hit.url)
            hits.append(hit)
        if query.data[FILTER_DEAD]:
            validate_images(hits, image_urls)
        serialized_results = ImageSerializer(hits, many=True).data

        # Elasticsearch does not allow deep pagination of ranked queries, so
        # the reported page count is capped (the cap is derived from the
        # hard-coded 5000 below).
        total_hits = search_results.hits.total
        natural_page_count = int(total_hits / page_size)
        last_allowed_page = int((5000 + page_size / 2) / page_size)
        page_count = min(natural_page_count, last_allowed_page)

        # With less than a full page and a zero page count, report the number
        # of results actually returned instead of the backend total.
        if len(hits) < page_size and page_count == 0:
            result_count = len(hits)
        else:
            result_count = total_hits
        response_data = {
            'result_count': result_count,
            'page_count': page_count,
            RESULTS: serialized_results
        }

        # Post-process the serialized results to fix malformed URLs and
        # insecure HTTP thumbnails.
        for idx, item in enumerate(serialized_results):
            if PROXY_THUMBS:
                provider = item[PROVIDER]
                # Proxy either the thumbnail or URL, depending on whether
                # a thumbnail was provided.
                source_field = (
                    THUMBNAIL
                    if THUMBNAIL in item and provider not in PROXY_ALL
                    else URL
                )
                if 'http://' in item[source_field] or provider in PROXY_ALL:
                    response_data[RESULTS][idx][THUMBNAIL] = (
                        '{proxy_url}/{width}/{original}'.format(
                            proxy_url=THUMBNAIL_PROXY_URL,
                            width=THUMBNAIL_WIDTH_PX,
                            original=item[source_field]
                        )
                    )
            if FOREIGN_LANDING_URL in item:
                response_data[RESULTS][idx][FOREIGN_LANDING_URL] = \
                    _add_protocol(item[FOREIGN_LANDING_URL])
            if CREATOR_URL in item:
                response_data[RESULTS][idx][CREATOR_URL] = \
                    _add_protocol(item[CREATOR_URL])

        serialized_response = ImageSearchResultsSerializer(data=response_data)
        return Response(status=200, data=serialized_response.initial_data)