Esempio n. 1
0
class SearchImages(APIView):
    """
    Search for images by a query string. Optionally, filter results by specific
    licenses, or license "types" (commercial use allowed, modification allowed,
    etc). Results are ranked in order of relevance.

    Refer to the Lucene syntax guide for information on structuring advanced
    searches. https://lucene.apache.org/core/2_9_4/queryparsersyntax.html

    Although there may be millions of relevant records, only the most relevant
    several thousand records can be viewed. This is by design: the search
    endpoint should be used to find the top N most relevant results, not for
    exhaustive search or bulk download of every barely relevant result. As such,
    the caller should not try to access pages beyond `page_count`, or else the
    server will reject the query.
    """
    # NOTE(review): the 200 schema below is declared with many=True, while the
    # handler returns a single object holding the counts and the result list
    # -- confirm the generated documentation is what is intended.
    @swagger_auto_schema(operation_id='image_search',
                         query_serializer=ImageSearchQueryStringSerializer,
                         responses={
                             200: ImageSearchResultsSerializer(many=True),
                             400: ValidationErrorSerializer,
                         })
    def get(self, request, format=None):
        # Handle GET: validate the query string, delegate the search to the
        # search controller, and wrap the serialized hits together with the
        # result count and page count.
        #
        # Responds with 400 when the query string fails validation or when the
        # controller raises ValueError (its message is returned to the
        # caller); otherwise responds with 200.
        query = ImageSearchQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(data={"validation_error": query.errors},
                            status=400)

        # NOTE(review): if _get_user_ip returns a str, builtin hash() is
        # salted per interpreter process unless PYTHONHASHSEED is fixed --
        # confirm the controller does not rely on a stable value.
        ip_hash = hash(_get_user_ip(request))
        fields = query.data
        page_param = fields[PAGE]
        page_size = fields[PAGESIZE]
        qa = fields[QA]
        filter_dead = fields[FILTER_DEAD]

        # Requests flagged as QA are answered from the 'search-qa' index.
        if qa:
            search_index = 'search-qa'
        else:
            search_index = 'image'

        try:
            results, page_count, result_count = search_controller.search(
                query,
                search_index,
                page_size,
                ip_hash,
                request,
                filter_dead,
                page=page_param)
        except ValueError as err:
            return Response(data={VALIDATION_ERROR: str(err)}, status=400)

        serialized_results = ImageSerializer(results, many=True).data

        # When the reported page count is zero and fewer results than one page
        # came back, report the number of results actually returned.
        returned = len(results)
        if returned < page_size and page_count == 0:
            result_count = returned

        response_data = {
            RESULT_COUNT: result_count,
            PAGE_COUNT: page_count,
            RESULTS: serialized_results
        }
        # The envelope serializer only carries the payload here; is_valid()
        # is never called, so initial_data is the dict built above.
        envelope = ImageSearchResultsSerializer(data=response_data)
        return Response(data=envelope.initial_data, status=200)
Esempio n. 2
0
    def get(self, request, provider, format=None):
        # Handle GET: validate the query string, then list images for
        # `provider` from the 'image' index via the search controller.
        #
        # Responds with 400 when the query string is invalid, when the
        # controller raises ValueError (its message is returned), or when it
        # raises serializers.ValidationError (reported as an unknown
        # provider); otherwise responds with 200.
        query = BrowseImageQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(data={"validation_error": query.errors},
                            status=400)

        fields = query.data
        page_param = fields[PAGE]
        page_size = fields[PAGESIZE]
        filter_dead = fields[FILTER_DEAD]

        # 'lt' takes precedence: 'li' is only read when 'lt' is absent.
        has_lt = 'lt' in fields
        lt = fields['lt'] if has_lt else None
        li = fields['li'] if not has_lt and 'li' in fields else None

        try:
            # The IP hash is computed inside the try block, so a ValueError
            # raised while obtaining it is also answered with a 400.
            results, page_count, result_count = \
                search_controller.browse_by_provider(
                    provider,
                    'image',
                    page_size,
                    hash(_get_user_ip(request)),
                    request,
                    filter_dead,
                    page=page_param,
                    lt=lt,
                    li=li
                )
        except ValueError as err:
            return Response(data={VALIDATION_ERROR: str(err)}, status=400)
        except serializers.ValidationError:
            message = 'Provider \'{}\' does not exist.'.format(provider)
            return Response(data={VALIDATION_ERROR: message}, status=400)

        response_data = {
            'result_count': result_count,
            'page_count': page_count,
            RESULTS: ImageSerializer(results, many=True).data
        }
        # The envelope serializer only carries the payload here; is_valid()
        # is never called, so initial_data is the dict built above.
        envelope = ImageSearchResultsSerializer(data=response_data)
        return Response(data=envelope.initial_data, status=200)
Esempio n. 3
0
    def get(self, request, provider, format=None):
        # Handle GET: validate the query string, fetch a page of images for
        # `provider` from the 'image' index, post-process the hits and return
        # them with the total hit count and the page count.
        #
        # Responds with 400 when the query string is invalid, when the
        # controller raises ValueError (reported with DEEP_PAGINATION_ERROR),
        # or when it raises serializers.ValidationError (reported as an
        # unknown provider); otherwise responds with 200.
        query = BrowseImageQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(data={"validation_error": query.errors},
                            status=400)

        fields = query.data
        page_param = fields[PAGE]
        page_size = fields[PAGESIZE]

        # 'lt' takes precedence: 'li' is only read when 'lt' is absent.
        has_lt = 'lt' in fields
        lt = fields['lt'] if has_lt else None
        li = fields['li'] if not has_lt and 'li' in fields else None

        try:
            # The IP hash is computed inside the try block, so a ValueError
            # raised while obtaining it is also answered with a 400.
            browse_results = search_controller.browse_by_provider(
                provider,
                index='image',
                page_size=page_size,
                page=page_param,
                lt=lt,
                li=li,
                ip=hash(_get_user_ip(request)))
        except ValueError:
            return Response(data={VALIDATION_ERROR: DEEP_PAGINATION_ERROR},
                            status=400)
        except serializers.ValidationError:
            message = 'Provider \'{}\' does not exist.'.format(provider)
            return Response(data={VALIDATION_ERROR: message}, status=400)

        filter_dead = fields[FILTER_DEAD]
        results = _post_process_results(browse_results, request, filter_dead)
        serialized_results = ImageSerializer(results, many=True).data
        page_count = _get_page_count(browse_results, page_size)

        response_data = {
            'result_count': browse_results.hits.total,
            'page_count': page_count,
            RESULTS: serialized_results
        }
        # The envelope serializer only carries the payload here; is_valid()
        # is never called, so initial_data is the dict built above.
        envelope = ImageSearchResultsSerializer(data=response_data)
        return Response(data=envelope.initial_data, status=200)
Esempio n. 4
0
    def get(self, request, format=None):
        # Handle GET: validate the query string, search the 'image' index,
        # attach an absolute detail link to every hit and return the
        # serialized page with the total hit count and a capped page count.
        #
        # Responds with 400 when the query string is invalid or when the
        # search controller raises ValueError; otherwise responds with 200.
        query = ImageSearchQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(data={"validation_error": query.errors},
                            status=400)

        fields = query.data
        page = fields['page']
        page_size = fields['pagesize']

        try:
            search_response = search_controller.search(query,
                                                       index='image',
                                                       page_size=page_size,
                                                       page=page)
        except ValueError:
            return Response(
                data={'validation_error': 'Deep pagination is not allowed.'},
                status=400)

        # Resolve a link to the detail view for each hit.
        results = []
        for hit in search_response:
            hit.detail = request.build_absolute_uri(
                reverse('image-detail', [hit.id]))
            results.append(hit)

        serialized_results = ImageSerializer(results, many=True).data

        # Elasticsearch does not allow deep pagination of ranked queries.
        # Adjust returned page count to reflect this.
        total_hits = search_response.hits.total
        natural_page_count = int(total_hits / page_size)
        last_allowed_page = int((5000 + page_size / 2) / page_size)
        page_count = min(natural_page_count, last_allowed_page)

        response_data = {
            'result_count': total_hits,
            'page_count': page_count,
            'results': serialized_results
        }
        # The envelope serializer only carries the payload here; is_valid()
        # is never called, so initial_data is the dict built above.
        envelope = ImageSearchResultsSerializer(data=response_data)
        return Response(data=envelope.initial_data, status=200)
Esempio n. 5
0
    def get(self, request, format=None):
        # Handle GET: validate the query string, delegate the search to the
        # search controller, and wrap the serialized hits together with the
        # result count and page count.
        #
        # Responds with 400 when the query string fails validation or when the
        # controller raises ValueError (its message is returned to the
        # caller); otherwise responds with 200.
        query = ImageSearchQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(data={"validation_error": query.errors},
                            status=400)

        # NOTE(review): if _get_user_ip returns a str, builtin hash() is
        # salted per interpreter process unless PYTHONHASHSEED is fixed --
        # confirm the controller does not rely on a stable value.
        ip_hash = hash(_get_user_ip(request))
        fields = query.data
        page_param = fields[PAGE]
        page_size = fields[PAGESIZE]
        qa = fields[QA]
        filter_dead = fields[FILTER_DEAD]

        # Requests flagged as QA are answered from the 'search-qa' index.
        if qa:
            search_index = 'search-qa'
        else:
            search_index = 'image'

        try:
            results, page_count, result_count = search_controller.search(
                query,
                search_index,
                page_size,
                ip_hash,
                request,
                filter_dead,
                page=page_param)
        except ValueError as err:
            return Response(data={VALIDATION_ERROR: str(err)}, status=400)

        serialized_results = ImageSerializer(results, many=True).data

        # When the reported page count is zero and fewer results than one page
        # came back, report the number of results actually returned.
        returned = len(results)
        if returned < page_size and page_count == 0:
            result_count = returned

        response_data = {
            RESULT_COUNT: result_count,
            PAGE_COUNT: page_count,
            RESULTS: serialized_results
        }
        # The envelope serializer only carries the payload here; is_valid()
        # is never called, so initial_data is the dict built above.
        envelope = ImageSearchResultsSerializer(data=response_data)
        return Response(data=envelope.initial_data, status=200)
Esempio n. 6
0
    def get(self, request, format=None):
        # Handle GET: validate the query string, search the 'image' index,
        # attach an absolute detail link to every hit, call validate_images
        # when the FILTER_DEAD parameter is truthy, then post-process the
        # serialized results (thumbnail proxying and URL protocol fixes).
        #
        # Responds with 400 when the query string is invalid or when the
        # search controller raises ValueError; otherwise responds with 200.
        query = ImageSearchQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(data={"validation_error": query.errors},
                            status=400)

        # NOTE(review): if _get_user_ip returns a str, builtin hash() is
        # salted per interpreter process unless PYTHONHASHSEED is fixed --
        # confirm the controller does not rely on a stable value.
        ip_hash = hash(_get_user_ip(request))
        page_param = query.data[PAGE]
        page_size = query.data[PAGESIZE]

        try:
            search_response = search_controller.search(query,
                                                       index='image',
                                                       page_size=page_size,
                                                       ip=ip_hash,
                                                       page=page_param)
        except ValueError:
            return Response(
                data={VALIDATION_ERROR: 'Deep pagination is not allowed.'},
                status=400
            )

        # Resolve a link to the detail view for each hit and collect the hit
        # URLs that validate_images receives below.
        results = []
        urls_to_check = []
        for hit in search_response:
            hit.detail = request.build_absolute_uri(
                reverse('image-detail', [hit.identifier])
            )
            urls_to_check.append(hit.url)
            results.append(hit)
        if query.data[FILTER_DEAD]:
            validate_images(results, urls_to_check)
        serialized_results = ImageSerializer(results, many=True).data

        # Elasticsearch does not allow deep pagination of ranked queries.
        # Adjust returned page count to reflect this.
        total_hits = search_response.hits.total
        natural_page_count = int(total_hits / page_size)
        last_allowed_page = int((5000 + page_size / 2) / page_size)
        page_count = min(natural_page_count, last_allowed_page)

        # When the page count is zero and fewer results than one page came
        # back, report the number of results actually returned.
        returned = len(results)
        if returned < page_size and page_count == 0:
            result_count = returned
        else:
            result_count = total_hits

        response_data = {
            'result_count': result_count,
            'page_count': page_count,
            RESULTS: serialized_results
        }

        # Post-process the search results to fix malformed URLs and insecure
        # HTTP thumbnails. The entries are edited in place, so the changes
        # land in the payload built above.
        for item in response_data[RESULTS]:
            if PROXY_THUMBS:
                provider = item[PROVIDER]
                # Proxy either the thumbnail or URL, depending on whether
                # a thumbnail was provided.
                if THUMBNAIL in item and provider not in PROXY_ALL:
                    source_key = THUMBNAIL
                else:
                    source_key = URL
                source_url = item[source_key]
                if 'http://' in source_url or provider in PROXY_ALL:
                    item[THUMBNAIL] = '{proxy_url}/{width}/{original}'.format(
                        proxy_url=THUMBNAIL_PROXY_URL,
                        width=THUMBNAIL_WIDTH_PX,
                        original=source_url
                    )
            if FOREIGN_LANDING_URL in item:
                item[FOREIGN_LANDING_URL] = _add_protocol(
                    item[FOREIGN_LANDING_URL])
            if CREATOR_URL in item:
                item[CREATOR_URL] = _add_protocol(item[CREATOR_URL])

        # The envelope serializer only carries the payload here; is_valid()
        # is never called, so initial_data is the dict built above.
        envelope = ImageSearchResultsSerializer(data=response_data)
        return Response(data=envelope.initial_data, status=200)
Esempio n. 7
0
class SearchImages(APIView):
    """
    Search for images by keyword. Optionally, filter the results by specific
    licenses, or license "types" (commercial use allowed, modification allowed,
    etc). Results are ranked in order of relevance.

    Although there may be millions of relevant records, only the most relevant
    several thousand records can be viewed. This is by design: the search
    endpoint should be used to find the top N most relevant results, not for
    exhaustive search or bulk download of every barely relevant result. As such,
    the caller should not try to access pages beyond `page_count`, or else the
    server will reject the query.
    """

    # NOTE(review): the 200 schema below is declared with many=True, while the
    # handler returns a single object holding the counts and the result list
    # -- confirm the generated documentation is what is intended.
    @swagger_auto_schema(operation_id='image_search',
                         query_serializer=ImageSearchQueryStringSerializer,
                         responses={
                             200: ImageSearchResultsSerializer(many=True),
                             400: ValidationErrorSerializer,
                         })
    def get(self, request, format=None):
        # Handle GET: validate the query string, search the 'image' index,
        # attach an absolute detail link to every hit, call validate_images
        # when the FILTER_DEAD parameter is truthy, then post-process the
        # serialized results (thumbnail proxying and URL protocol fixes).
        #
        # Responds with 400 when the query string is invalid or when the
        # search controller raises ValueError; otherwise responds with 200.
        query = ImageSearchQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(data={"validation_error": query.errors},
                            status=400)

        # NOTE(review): if _get_user_ip returns a str, builtin hash() is
        # salted per interpreter process unless PYTHONHASHSEED is fixed --
        # confirm the controller does not rely on a stable value.
        ip_hash = hash(_get_user_ip(request))
        page_param = query.data[PAGE]
        page_size = query.data[PAGESIZE]

        try:
            search_response = search_controller.search(query,
                                                       index='image',
                                                       page_size=page_size,
                                                       ip=ip_hash,
                                                       page=page_param)
        except ValueError:
            return Response(
                data={VALIDATION_ERROR: 'Deep pagination is not allowed.'},
                status=400
            )

        # Resolve a link to the detail view for each hit and collect the hit
        # URLs that validate_images receives below.
        results = []
        urls_to_check = []
        for hit in search_response:
            hit.detail = request.build_absolute_uri(
                reverse('image-detail', [hit.identifier])
            )
            urls_to_check.append(hit.url)
            results.append(hit)
        if query.data[FILTER_DEAD]:
            validate_images(results, urls_to_check)
        serialized_results = ImageSerializer(results, many=True).data

        # Elasticsearch does not allow deep pagination of ranked queries.
        # Adjust returned page count to reflect this.
        total_hits = search_response.hits.total
        natural_page_count = int(total_hits / page_size)
        last_allowed_page = int((5000 + page_size / 2) / page_size)
        page_count = min(natural_page_count, last_allowed_page)

        # When the page count is zero and fewer results than one page came
        # back, report the number of results actually returned.
        returned = len(results)
        if returned < page_size and page_count == 0:
            result_count = returned
        else:
            result_count = total_hits

        response_data = {
            'result_count': result_count,
            'page_count': page_count,
            RESULTS: serialized_results
        }

        # Post-process the search results to fix malformed URLs and insecure
        # HTTP thumbnails. The entries are edited in place, so the changes
        # land in the payload built above.
        for item in response_data[RESULTS]:
            if PROXY_THUMBS:
                provider = item[PROVIDER]
                # Proxy either the thumbnail or URL, depending on whether
                # a thumbnail was provided.
                if THUMBNAIL in item and provider not in PROXY_ALL:
                    source_key = THUMBNAIL
                else:
                    source_key = URL
                source_url = item[source_key]
                if 'http://' in source_url or provider in PROXY_ALL:
                    item[THUMBNAIL] = '{proxy_url}/{width}/{original}'.format(
                        proxy_url=THUMBNAIL_PROXY_URL,
                        width=THUMBNAIL_WIDTH_PX,
                        original=source_url
                    )
            if FOREIGN_LANDING_URL in item:
                item[FOREIGN_LANDING_URL] = _add_protocol(
                    item[FOREIGN_LANDING_URL])
            if CREATOR_URL in item:
                item[CREATOR_URL] = _add_protocol(item[CREATOR_URL])

        # The envelope serializer only carries the payload here; is_valid()
        # is never called, so initial_data is the dict built above.
        envelope = ImageSearchResultsSerializer(data=response_data)
        return Response(data=envelope.initial_data, status=200)
Esempio n. 8
0
class BrowseImages(APIView):
    """
    Browse a collection of CC images by provider, such as the Metropolitan
    Museum of Art. See `/statistics/image` for a list of valid
    collections. The `provider_identifier` field should be used to select
    the provider.

    As with the `/image/search` endpoint, this is not intended to be used to
    bulk download our entire collection of images; only the first ~10,000 images
    in each collection are accessible.
    """
    # NOTE(review): the 200 schema below is declared with many=True, while the
    # handler returns a single object holding the counts and the result list
    # -- confirm the generated documentation is what is intended.
    @swagger_auto_schema(operation_id='image_browse',
                         query_serializer=BrowseImageQueryStringSerializer,
                         responses={
                             200: ImageSearchResultsSerializer(many=True),
                             400: ValidationErrorSerializer,
                         })
    def get(self, request, provider, format=None):
        # Handle GET: validate the query string, fetch a page of images for
        # `provider` from the 'image' index, post-process the hits and return
        # them with the total hit count and the page count.
        #
        # Responds with 400 when the query string is invalid, when the
        # controller raises ValueError (reported with DEEP_PAGINATION_ERROR),
        # or when it raises serializers.ValidationError (reported as an
        # unknown provider); otherwise responds with 200.
        query = BrowseImageQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(data={"validation_error": query.errors},
                            status=400)

        fields = query.data
        page_param = fields[PAGE]
        page_size = fields[PAGESIZE]

        # 'lt' takes precedence: 'li' is only read when 'lt' is absent.
        has_lt = 'lt' in fields
        lt = fields['lt'] if has_lt else None
        li = fields['li'] if not has_lt and 'li' in fields else None

        try:
            # The IP hash is computed inside the try block, so a ValueError
            # raised while obtaining it is also answered with a 400.
            browse_results = search_controller.browse_by_provider(
                provider,
                index='image',
                page_size=page_size,
                page=page_param,
                lt=lt,
                li=li,
                ip=hash(_get_user_ip(request)))
        except ValueError:
            return Response(data={VALIDATION_ERROR: DEEP_PAGINATION_ERROR},
                            status=400)
        except serializers.ValidationError:
            message = 'Provider \'{}\' does not exist.'.format(provider)
            return Response(data={VALIDATION_ERROR: message}, status=400)

        filter_dead = fields[FILTER_DEAD]
        results = _post_process_results(browse_results, request, filter_dead)
        serialized_results = ImageSerializer(results, many=True).data
        page_count = _get_page_count(browse_results, page_size)

        response_data = {
            'result_count': browse_results.hits.total,
            'page_count': page_count,
            RESULTS: serialized_results
        }
        # The envelope serializer only carries the payload here; is_valid()
        # is never called, so initial_data is the dict built above.
        envelope = ImageSearchResultsSerializer(data=response_data)
        return Response(data=envelope.initial_data, status=200)
Esempio n. 9
0
class SearchImages(APIView):
    """
    Search for images by keyword. Optionally, filter the results by specific
    licenses, or license "types" (commercial use allowed, modification allowed,
    etc). Results are ranked in order of relevance.

    Although there may be millions of relevant records, only the most relevant
    several thousand records can be viewed. This is by design: the search
    endpoint should be used to find the top N most relevant results, not for
    exhaustive search or bulk download of every barely relevant result. As such,
    the caller should not try to access pages beyond `page_count`, or else the
    server will reject the query.
    """
    # NOTE(review): the 200 schema below is declared with many=True, while the
    # handler returns a single object holding the counts and the result list
    # -- confirm the generated documentation is what is intended.
    @swagger_auto_schema(operation_id='image_search',
                         query_serializer=ImageSearchQueryStringSerializer,
                         responses={
                             200: ImageSearchResultsSerializer(many=True),
                             400: ValidationErrorSerializer,
                         })
    def get(self, request, format=None):
        # Handle GET: validate the query string, search the 'image' index,
        # attach an absolute detail link to every hit and return the
        # serialized page with the total hit count and a capped page count.
        #
        # Responds with 400 when the query string is invalid or when the
        # search controller raises ValueError; otherwise responds with 200.
        query = ImageSearchQueryStringSerializer(data=request.query_params)
        if not query.is_valid():
            return Response(data={"validation_error": query.errors},
                            status=400)

        fields = query.data
        page = fields['page']
        page_size = fields['pagesize']

        try:
            search_response = search_controller.search(query,
                                                       index='image',
                                                       page_size=page_size,
                                                       page=page)
        except ValueError:
            return Response(
                data={'validation_error': 'Deep pagination is not allowed.'},
                status=400)

        # Resolve a link to the detail view for each hit.
        results = []
        for hit in search_response:
            hit.detail = request.build_absolute_uri(
                reverse('image-detail', [hit.id]))
            results.append(hit)

        serialized_results = ImageSerializer(results, many=True).data

        # Elasticsearch does not allow deep pagination of ranked queries.
        # Adjust returned page count to reflect this.
        total_hits = search_response.hits.total
        natural_page_count = int(total_hits / page_size)
        last_allowed_page = int((5000 + page_size / 2) / page_size)
        page_count = min(natural_page_count, last_allowed_page)

        response_data = {
            'result_count': total_hits,
            'page_count': page_count,
            'results': serialized_results
        }
        # The envelope serializer only carries the payload here; is_valid()
        # is never called, so initial_data is the dict built above.
        envelope = ImageSearchResultsSerializer(data=response_data)
        return Response(data=envelope.initial_data, status=200)